R/save_functions.r

Defines functions .resolve_ris_tag .norm_space .reverse_map_ris_tags .get_reverse_ris_tag_map .map_ris_tags save_dataframe_to_ris read_ris_to_dataframe

Documented in read_ris_to_dataframe save_dataframe_to_ris

#' Read an RIS file into a data frame
#'
#' Parses an RIS file into a data.frame, preserving the order of tags as they
#' first appear in the file. Repeated tags within a record are collapsed into a
#' single semicolon-separated string.
#'
#' @param file_path character. Path to the RIS file to read.
#'
#' @return A data.frame with one row per record and one column per encountered
#'   RIS tag, using descriptive column names. Columns are ordered by first appearance of the
#'   tag in the file. Repeated tag values are collapsed with "; ".
#'
#' @examples
#' \dontrun{
#' df <- read_ris_to_dataframe("data-raw/raw data/apa_psycinfo_test_data.ris")
#' }
#'
#' @export
read_ris_to_dataframe <- function(file_path) {
  lines <- readLines(file_path, encoding = "UTF-8")

  preallocate_size <- max(1L, length(lines) %/% 5L + 1L) # Rough estimate: average 5 lines per record (varies widely), so preallocate for that many records to improve performance
  records <- vector("list", preallocate_size) # To store parsed records as lists of fields
  record_order <- vector("list", preallocate_size) # To track the order of tags for each record
  rec_idx <- 0L # Current record index
  current_record <- list() # Current record being processed
  current_order <- character(0) # Order of tags in the current record
  field_order <- character(0) # Order of all encountered tags
  last_field <- NULL # Last field processed
  after_end_record <- FALSE # Flag indicating if the last line was an end-of-record marker
  for (raw_line in lines) {
    line <- sub("[\r\n]+$", "", raw_line) # Remove trailing newlines
    if (trimws(line) == "") next  

    if (grepl("^ER[[:space:]]+-[[:space:]]*$", line)) {
      if (length(current_record) > 0) {
        rec_idx <- rec_idx + 1L # Move to next record index
        records[[rec_idx]] <- current_record # Save the current record
        record_order[[rec_idx]] <- current_order # Save the order of tags for this record
      }
      current_record <- list() # Reset the current record
      current_order <- character(0) # Reset the order of tags for the new record
      last_field <- NULL # Reset last field
      after_end_record <- TRUE # Set flag to indicate we just finished a record
    } else if (grepl("^[^[:space:]]+[[:space:]]+-[[:space:]]", line)) { # Matches lines that start with a tag followed by "  - "
      field <- sub("^([^[:space:]]+)[[:space:]]+-[[:space:]].*$", "\\1", line) # Extract the tag (field name)
      value <- .norm_space(sub("^[^[:space:]]+[[:space:]]+-[[:space:]]", "", line)) # Extract the value and normalize whitespace

      if ((rec_idx == 0L && length(current_record) == 0) || after_end_record) {
        if (length(current_record) > 0) {
          rec_idx <- rec_idx + 1L
          records[[rec_idx]] <- current_record
          record_order[[rec_idx]] <- current_order
        }
        current_record <- list() # Reset the current record
        current_order <- character(0) # Reset the order of tags for the new record
        after_end_record <- FALSE # Reset flag
      }

      if (!field %in% field_order) field_order <- c(field_order, field) # if this is the first time we've seen this field, add it to the overall field order
      current_order <- c(current_order, field)

      if (field %in% names(current_record)) {
        current_record[[field]] <- if (is.list(current_record[[field]])) { # If it's already a list, append the new value
          append(current_record[[field]], value) 
        } else {
          list(current_record[[field]], value) # If it's not a list, create a list with the existing value and the new value
        }
      } else {
        current_record[[field]] <- value # First time we've seen this field in the current record, just set it
      }
      last_field <- field
    } else if (!is.null(last_field) && last_field %in% names(current_record)) {
      cont <- .norm_space(if (grepl("^[[:space:]]{2}", line)) sub("^[[:space:]]{2}", "", line) else line) # Continuation lines should start with at least 2 spaces
      curval <- current_record[[last_field]] # Get the current value for the last field
      if (is.list(curval)) {
        n <- length(curval) # Append the continuation to the last value in the list
        curval[[n]] <- .norm_space(paste(curval[[n]], cont)) # Normalize whitespace and append the continuation to the last value in the list
        current_record[[last_field]] <- curval # Update the record with the modified list
      } else {
        current_record[[last_field]] <- .norm_space(paste(curval, cont)) # Normalize whitespace and append the continuation to the current value
      }
    }
  }

  if (length(current_record) > 0) {
    rec_idx <- rec_idx + 1L # Save the last record if we ended without an ER tag
    records[[rec_idx]] <- current_record # Save the last record
    record_order[[rec_idx]] <- current_order # Save the order of tags for the last record
  }
  if (rec_idx == 0L) return(data.frame()) # If no records were found, return an empty data frame

  records <- records[seq_len(rec_idx)] # Trim the records list to the actual number of records found
  record_order <- record_order[seq_len(rec_idx)] # Trim the record order list to the actual number of records found

  df_list <- vector("list", length(field_order)) # Preallocate a list of vectors for each field, ordered by first appearance in the file
  names(df_list) <- field_order
  raw_values <- vector("list", length(field_order)) # Preallocate a list of raw values for each field
  names(raw_values) <- field_order

  for (field in field_order) {
    df_list[[field]] <- character(rec_idx) # Initialize the data frame list with empty character vectors for each field
    raw_values[[field]] <- vector("list", rec_idx) # Initialize the raw values list with empty lists for each field to store raw values per record
  }

  for (i in seq_len(rec_idx)) {
    for (field in names(records[[i]])) { 
      value <- records[[i]][[field]] # Get the value for this field in the current record
      if (is.list(value)) {
        vals <- .norm_space(unlist(value)) # Normalize whitespace for each value in the list
        df_list[[field]][i] <- paste(vals, collapse = "; ") # Collapse multiple values with "; " for the data frame
        raw_values[[field]][[i]] <- vals # Store the raw values as a list for this field and record
      } else {
        df_list[[field]][i] <- .norm_space(value) # Normalize whitespace for the single value and store it in the data frame
        raw_values[[field]][[i]] <- df_list[[field]][i] # Store the raw value (which is the same as the normalized value in this case) for this field and record
      }
    }
  }

  df <- data.frame(df_list, stringsAsFactors = FALSE, check.names = FALSE)
  attr(df, "ris_raw_values") <- raw_values
  attr(df, "ris_field_order") <- field_order
  attr(df, "ris_record_order") <- record_order
  .map_ris_tags(df)
}

#' Write a data frame to a RIS file
#'
#' Writes a data.frame to a RIS file, one record per row. If the data frame was created
#' by \code{read_ris_to_dataframe()}, the original RIS tag order and tags are preserved where possible.
#' Otherwise, a standard RIS format is used.
#'
#' If a field value contains semicolons, it is split and written as multiple tag lines. The \code{TY}
#' (source type) field is written first for each record, followed by all other fields. Records are
#' terminated with \code{ER  - }.
#'
#' @param df data.frame. The data to write.
#' @param file_path character. Path to the output RIS file.
#'
#' @return A character string indicating the file path where the RIS file was saved.
#'
#' @examples
#' \dontrun{
#' df <- read_ris_to_dataframe("data-raw/raw data/apa_psycinfo_test_data.ris")
#' save_dataframe_to_ris(df, "path/to/output.ris")
#' }
#'
#' @export
save_dataframe_to_ris <- function(df, file_path) {
  tag_meta <- attr(df, "ris_tag_used", exact = TRUE) # Metadata about which original tag was used for each column
  raw_meta <- attr(df, "ris_raw_values", exact = TRUE) # Raw values per tag
  record_order <- attr(df, "ris_record_order", exact = TRUE) # Original record order
  has_meta <- is.list(tag_meta) && length(tag_meta) > 0 # Check if metadata exists
  has_raw <- is.list(raw_meta) && length(raw_meta) > 0 # Check if raw values exist
  has_record_order <- is.list(record_order) && length(record_order) == nrow(df) # Check if record order exists
  reverse_tag_map <- .get_reverse_ris_tag_map() # Get reverse mapping of descriptive names to RIS tags

  if (!has_meta) df <- .reverse_map_ris_tags(df) # Reverse map if no metadata

  # Attempt to preserve original formatting if possible
  if (has_raw && has_record_order) {
    raw_tags <- names(raw_meta) # Get all RIS tags from raw metadata
    raw_df_list <- vector("list", length(raw_tags))
    names(raw_df_list) <- raw_tags
    for (tag in raw_tags) raw_df_list[[tag]] <- character(nrow(df))

    # Fill raw_df_list with normalized raw values
    for (i in seq_len(nrow(df))) {
      for (tag in raw_tags) {
        vals <- unlist(raw_meta[[tag]][[i]])
        vals <- vals[!is.na(vals) & vals != ""]
        if (length(vals) > 0) {
          raw_df_list[[tag]][i] <- .norm_space(paste(.norm_space(vals), collapse = "; ")) # Collapse multiple values with "; "
        }
      }
    }

    # Reconstruct data frame from raw values and compare with original
    mapped_raw_df <- .map_ris_tags(data.frame(raw_df_list, stringsAsFactors = FALSE)) 
    common_cols <- intersect(names(mapped_raw_df), names(df)) # Find common columns between mapped raw and original df
    identical_on_common <- all(sapply(common_cols, function(col) { # Check if values are identical for common columns
      # Compare normalized values for each common column
      a <- as.character(mapped_raw_df[[col]])
      b <- as.character(df[[col]])
      a[is.na(a)] <- ""
      b[is.na(b)] <- ""
      identical(.norm_space(a), .norm_space(b))
    }))

    if (identical_on_common) { # If identical, write using original raw formatting
      con <- file(file_path, "w", encoding = "UTF-8")
      on.exit(close(con), add = TRUE)

      for (i in seq_len(nrow(df))) {
        tag_pos <- list()
        for (tag in record_order[[i]]) {
          if (tag == "ER" || is.null(raw_meta[[tag]])) next
          vals <- unlist(raw_meta[[tag]][[i]])
          vals <- vals[!is.na(vals) & vals != ""]
          if (length(vals) == 0) next
          pos <- if (is.null(tag_pos[[tag]])) 1L else tag_pos[[tag]]
          if (pos <= length(vals)) {
            writeLines(paste0(tag, "  - ", .norm_space(vals[[pos]])), con)
            tag_pos[[tag]] <- pos + 1L
          }
        }
        writeLines("ER  - ", con)
        writeLines("", con)
      }
      cat("RIS file saved to:", file_path, "\n")
      return(invisible(file_path))
    }
  }
  # Fallback: write using normalized values from df
  con <- file(file_path, "w", encoding = "UTF-8")
  multi_value_tags <- c("AU","A1","KW","M1","T1","TI","SN","JF","JO","JA","Y1","PY",
                        "ID","AN","UR","DO","DP","DB","AD","PB","VL","IS","SP","EP","U1","U2","CN")
  free_text_tags <- c("AB","N1","NT","N2","NO","U1") # Tags that may contain semicolons as part of free text
  ty_col <- if ("TY" %in% names(df)) "TY" else if ("source_type" %in% names(df)) "source_type" else NULL # Determine TY column

  # Write each record
  for (i in seq_len(nrow(df))) {
    row <- df[i, , drop = FALSE]

    # Write TY field first
    if (!is.null(ty_col)) {
      ty_val <- as.character(row[[ty_col]])
      if (!is.na(ty_val) && ty_val != "") {
        writeLines(paste0(.resolve_ris_tag(ty_col, i, tag_meta = tag_meta, reverse_tag_map = reverse_tag_map), "  - ", .norm_space(ty_val)), con)
      }
    }

    # Write other fields
    for (col_name in names(row)) {
      if (!is.null(ty_col) && col_name == ty_col) next
      value <- row[[col_name]]
      if (is.factor(value) || is.list(value) || length(value) > 1) {
        value <- paste(unlist(value), collapse = "; ")
      }
      value <- as.character(value)
      if (is.na(value) || value == "") next
      value <- .norm_space(value)
      tag_to_write <- .resolve_ris_tag(col_name, i, tag_meta = tag_meta, reverse_tag_map = reverse_tag_map)

      # Attempt to write from raw values if they match
      wrote_from_raw <- FALSE
      if (has_raw && !is.null(raw_meta[[tag_to_write]])) {
        raw_vals <- unlist(raw_meta[[tag_to_write]][[i]])
        raw_vals <- raw_vals[!is.na(raw_vals) & raw_vals != ""]
        if (length(raw_vals) > 0 && identical(.norm_space(value), .norm_space(paste(.norm_space(raw_vals), collapse = "; ")))) {
          for (rv in .norm_space(raw_vals)) {
            if (rv != "") writeLines(paste0(tag_to_write, "  - ", rv), con)
          }
          wrote_from_raw <- TRUE
        }
      }

      # Fallback: write normalized value
      if (!wrote_from_raw) {
        if (grepl(";", value) && tag_to_write %in% multi_value_tags && !(tag_to_write %in% free_text_tags)) {
          for (val in strsplit(value, ";\\s*")[[1]]) {
            val <- trimws(val)
            if (val != "") writeLines(paste0(tag_to_write, "  - ", val), con)
          }
        } else {
          writeLines(paste0(tag_to_write, "  - ", value), con)
        }
      }
    }
    writeLines("ER  - ", con)
    writeLines("", con)
  }
  close(con)
  cat("RIS file saved to:", file_path, "\n")
}


# Helper function to map RIS tags to descriptive names
.map_ris_tags <- function(df) {
  # Preserve read-time metadata.
  raw_values <- attr(df, "ris_raw_values", exact = TRUE)
  field_order <- attr(df, "ris_field_order", exact = TRUE)
  record_order <- attr(df, "ris_record_order", exact = TRUE)

  tag_map <- c(
    DB = "database",
    FN = "file_name",
    N = "file_name",
    DA = "date_generated",
    DT = "document_type",
    M3 = "document_type",
    TY = "source_type",
    LT = "publication_type",
    PT = "publication_type",
    LA = "language",
    PST = "status",
    SA = "street_address",
    STAT = "status",
    A1 = "author",
    A2 = "author",
    A3 = "author",
    A4 = "author",
    A5 = "author",
    AU = "author",
    AF = "author_full",
    FAU = "author_full",
    BA = "author_book",
    BF = "author_book_full",
    CA = "author_group",
    GP = "author_group",
    CN = "author_corporate",
    CNx = "author_corporate",
    Z2 = "author_otherlang",
    IR = "investigator",
    IV = "investigator",
    FIR = "investigator_full",
    A2x = "editor",
    BE = "editor",
    ED = "editor",
    EDx = "editor",
    FED = "editor_full",
    EM = "email",
    OI = "orcid_id",
    AD = "address",
    ADx = "address",
    C1 = "address",
    M1 = "address",
    CY = "address",
    IRAD = "address",
    RP = "reprint_address",
    PS = "personal_name_as_subject",
    FPS = "personal_name_as_subject_full",
    PSx = "personal_name_as_subject",
    RI = "researcher_id",
    AUID = "author_id",
    PY = "year",
    Y1 = "year",
    EY = "year_early_access",
    CRDT = "date_created",
    RC = "date_created",
    LR = "date_revised",
    DCOM = "date_completed",
    EDAT = "date_added",
    EA = "date_early_access",
    DEP = "date_published_elec",
    DP = "database_provider",
    PD = "date_published",
    PHST = "publication_history_status",
    T1 = "title",
    TI = "title",
    BTI = "title_book",
    FT = "title_foreign",
    Z1 = "title_otherlang",
    TT = "title_transliterated",
    JA = "journal_ja",
    JF = "journal_jf",
    JO = "journal_jo",
    JT = "journal_jt",
    T3 = "journal_t3",
    TA = "journal_abbreviated",
    SO = "source",
    T2 = "source",
    J2 = "source_abbreviated",
    J9 = "source_abbreviated",
    JI = "source_abbreviated",
    Z3 = "source_otherlang",
    SI = "secondary_source_id",
    C3 = "custom3",
    VTI = "volume_title",
    SE = "book_series_title",
    BS = "book_series_subtitle",
    ED = "edition",
    EN = "edition",
    CTI = "collection_title",
    VI = "volume",
    VL = "volume",
    IP = "issue",
    IS = "issue",
    SI = "special_issue",
    AR = "article_number",
    C7 = "custom7",
    SU = "supplement",
    PG = "pages",
    BP = "start_page",
    SP = "start_page",
    EP = "end_page",
    PG = "n_pages",
    PS = "n_pages",
    P2 = "n_chapters",
    AB = "abstract",
    N2 = "abstract",
    A2x = "abstract_other",
    OAB = "abstract_other",
    Z4 = "abstract_otherlang",
    DE = "keywords",
    KW = "keywords",
    MI = "keywords",
    ID = "record_id",
    MH = "mesh_terms",
    MHDA = "mesh_date",
    VR = "version",
    DI = "doi",
    DO = "doi",
    L3 = "doi",
    D2 = "doi_book",
    BN = "isbn",
    ISBN = "isbn",
    ISx = "issn",
    SN = "issn",
    EI = "eissn",
    CN = "call_number",
    AID = "article_id",
    ID = "article_id",
    MID = "manuscript_id",
    C2 = "pubmed_id",
    PM = "pubmed_id",
    PMID = "pubmed_id",
    PMC = "pubmed_central_identitfier",
    PMCR = "pubmed_central_release",
    JC = "nlm_id",
    JID = "nlm_id",
    OID = "other_id",
    RN = "registry_number",
    UT = "accession_number",
    AN = "accession_nr",
    GA = "document_delivery_id",
    UR = "url",
    AW = "url",
    SF = "space_flight_mission",
    SFM = "space_flight_mission",
    PE = "published_elec",
    CT = "conference_name",
    HO = "conference_host",
    CL = "conference_location",
    CYx = "conference_location",
    CY = "conference_date",
    Y2 = "conference_date",
    A4x = "conference_sponsor",
    SPx = "conference_sponsor",
    MA = "meeting_abstract",
    PB = "publisher",
    PU = "publisher",
    PA = "place_published",
    PI = "place_published",
    PL = "place_published",
    PP = "place_published",
    AE = "patent_assignee",
    PN = "patent_number",
    CI = "copyright_info",
    COI = "conflict_of_interest",
    OCI = "copyright_info_other",
    GN = "gene_name",
    GS = "gene_symbol",
    N1 = "notes1",
    OA = "open_access",
    PUBM = "publishing_model",
    OWN = "owner",
    OB = "record_owner",
    DP = "data_provider",
    FU = "funding_agency",
    FX = "funding_text",
    GR = "grant_number",
    GI = "grant_information",
    LID = "location_id",
    OT = "term_other",
    OTO = "term_owner_other",
    SB = "subset",
    NM = "substance_name",
    WC = "wos_categories",
    SC = "research_areas",
    CH = "chemicals",
    DS = "diseases",
    PR = "parts",
    OR = "systematics",
    MQ = "methods",
    ME = "medium",
    GE = "geographic_data",
    TM = "geologic_data",
    SD = "sequence_data",
    TAx = "taxonomic_data",
    ST = "study_abbr",
    TN = "taxa_notes",
    BD = "concepts",
    CC = "concepts",
    MC = "concepts",
    TC = "n_cited",
    Z9 = "n_cited_allwos",
    Z8 = "n_cited_csc",
    ZB = "n_cited_biosis",
    HC = "esi_highly_cited",
    HP = "esi_hot_paper",
    U2 = "custom2",
    NR = "references_n",
    RF = "references_n",
    CR = "cited_references",
    SS = "citation_subset",
    NT = "notes",
    NO = "comments",
    ER = "end_record",
    EF = "end_file"
  )

  mapped_names <- sapply(seq_along(names(df)), function(i) {
    original_name <- names(df)[i]
    if (original_name == "U1") {
      non_empty <- df[[i]][!is.na(df[[i]]) & df[[i]] != ""]
      # If all values in the U1 column are numeric, map to eppi_id, else custom_info
      return(if (length(non_empty) > 0 && all(grepl("^[0-9]+$", non_empty))) "eppi_id" else "custom_info")
    }
    if (original_name %in% names(tag_map)) tag_map[[original_name]] else original_name
  })
  
  result_df <- data.frame(matrix(ncol = 0, nrow = nrow(df)))
  tag_used <- list() # To track which original tag was used for each column
  
  # Process each unique mapped name
  for (base_name in unique(mapped_names)) {
    matching_indices <- which(mapped_names == base_name)
    
    # If only one matching column, copy it directly
    if (length(matching_indices) == 1) {
      result_df[[base_name]] <- df[[matching_indices]]
      tag_used[[base_name]] <- rep(names(df)[matching_indices], nrow(df))
      # If multiple matching columns, handle repeated fields
    } else {
      result_df[[base_name]] <- character(nrow(df))
      used_col_per_row <- integer(nrow(df))
      base_tag_per_row <- character(nrow(df))
      
      for (row_idx in seq_len(nrow(df))) {
        for (col_idx in matching_indices) {
          val <- df[row_idx, col_idx]
          if (!is.na(val) && val != "") {
            result_df[row_idx, base_name] <- val
            used_col_per_row[row_idx] <- col_idx
            base_tag_per_row[row_idx] <- names(df)[col_idx]
            break
          }
        }
      }
      tag_used[[base_name]] <- base_tag_per_row # Track which column was used for base name
      
      counter <- 2 # Start counter for repeated fields
      # Handle additional columns for repeated fields
      for (col_idx in matching_indices) {
        col_name <- paste0(base_name, counter)
        result_df[[col_name]] <- character(nrow(df))
        col_tag_vec <- character(nrow(df))
        added_any <- FALSE
        
        # Fill in values for this repeated field
        for (row_idx in seq_len(nrow(df))) {
          if (used_col_per_row[row_idx] == col_idx) next
          val <- df[row_idx, col_idx]
          base_val <- result_df[row_idx, base_name]
          if (!is.na(val) && val != "" && base_val != "" && val != base_val) {
            result_df[row_idx, col_name] <- val
            col_tag_vec[row_idx] <- names(df)[col_idx]
            added_any <- TRUE
          }
        }
        
        # If any values were added, keep this column; else remove it
        if (added_any) {
          tag_used[[col_name]] <- col_tag_vec
          counter <- counter + 1
        } else {
          result_df[[col_name]] <- NULL
        }
      }
    }
  }

  attr(result_df, "ris_tag_used") <- tag_used
  if (is.list(raw_values)) attr(result_df, "ris_raw_values") <- raw_values
  if (!is.null(field_order)) attr(result_df, "ris_field_order") <- field_order
  if (is.list(record_order)) attr(result_df, "ris_record_order") <- record_order
  result_df
}

# Helper function: reverse mapping from descriptive names to RIS tags
.get_reverse_ris_tag_map <- function() {
  # Using the most common/standard tag for each descriptive name
  c(
    database = "DB",
    file_name = "FN",
    date_generated = "DA",
    document_type = "M3",
    source_type = "TY",
    publication_type = "PT",
    language = "LA",
    status = "PST",
    street_address = "SA",
    author = "AU",
    author_full = "AF",
    author_book = "BA",
    author_book_full = "BF",
    author_group = "CA",
    author_corporate = "CN",
    author_otherlang = "Z2",
    investigator = "IR",
    investigator_full = "FIR",
    editor = "ED",
    editor_full = "FED",
    email = "EM",
    orcid_id = "OI",
    address = "AD",
    reprint_address = "RP",
    personal_name_as_subject = "PS",
    personal_name_as_subject_full = "FPS",
    researcher_id = "RI",
    author_id = "AUID",
    year = "PY",
    year_early_access = "EY",
    date_created = "CRDT",
    date_revised = "LR",
    date_completed = "DCOM",
    date_added = "EDAT",
    date_early_access = "EA",
    date_published_elec = "DEP",
    database_provider = "DP",
    date_published = "PD",
    publication_history_status = "PHST",
    title = "TI",
    title_book = "BTI",
    title_foreign = "FT",
    title_otherlang = "Z1",
    title_transliterated = "TT",
    journal = "JO",
    journal_ja = "JA",
    journal_jf = "JF",
    journal_jo = "JO",
    journal_jt = "JT",
    journal_t3 = "T3",
    journal_abbreviated = "TA",
    source = "T2",
    source_abbreviated = "J2",
    source_otherlang = "Z3",
    secondary_source_id = "SI",
    custom3 = "C3",
    volume_title = "VTI",
    book_series_title = "SE",
    book_series_subtitle = "BS",
    edition = "EN",
    collection_title = "CTI",
    volume = "VL",
    issue = "IS",
    special_issue = "SI",
    article_number = "AR",
    custom7 = "C7",
    supplement = "SU",
    pages = "PG",
    start_page = "SP",
    end_page = "EP",
    n_pages = "PG",
    n_chapters = "P2",
    abstract = "AB",
    abstract_other = "OAB",
    abstract_otherlang = "Z4",
    keywords = "KW",
    record_id = "ID",
    mesh_terms = "MH",
    mesh_date = "MHDA",
    version = "VR",
    doi = "DO",
    doi_book = "D2",
    isbn = "ISBN",
    issn = "SN",
    eissn = "EI",
    call_number = "CN",
    article_id = "AID",
    manuscript_id = "MID",
    pubmed_id = "PMID",
    pubmed_central_identitfier = "PMC",
    pubmed_central_release = "PMCR",
    nlm_id = "JID",
    other_id = "OID",
    registry_number = "RN",
    accession_number = "UT",
    accession_nr = "AN",
    document_delivery_id = "GA",
    url = "UR",
    space_flight_mission = "SFM",
    published_elec = "PE",
    conference_name = "CT",
    conference_host = "HO",
    conference_location = "CL",
    conference_date = "Y2",
    conference_sponsor = "SP",
    meeting_abstract = "MA",
    publisher = "PB",
    place_published = "PP",
    patent_assignee = "AE",
    patent_number = "PN",
    copyright_info = "CI",
    conflict_of_interest = "COI",
    copyright_info_other = "OCI",
    gene_name = "GN",
    gene_symbol = "GS",
    notes1 = "N1",
    open_access = "OA",
    publishing_model = "PUBM",
    owner = "OWN",
    record_owner = "OB",
    data_provider = "DP",
    funding_agency = "FU",
    funding_text = "FX",
    grant_number = "GR",
    grant_information = "GI",
    location_id = "LID",
    term_other = "OT",
    term_owner_other = "OTO",
    subset = "SB",
    substance_name = "NM",
    wos_categories = "WC",
    research_areas = "SC",
    chemicals = "CH",
    diseases = "DS",
    parts = "PR",
    systematics = "OR",
    methods = "MQ",
    medium = "ME",
    geographic_data = "GE",
    geologic_data = "TM",
    sequence_data = "SD",
    taxonomic_data = "TA",
    study_abbr = "ST",
    taxa_notes = "TN",
    concepts = "CC",
    n_cited = "TC",
    n_cited_allwos = "Z9",
    n_cited_csc = "Z8",
    n_cited_biosis = "ZB",
    esi_highly_cited = "HC",
    esi_hot_paper = "HP",
    custom2 = "U2",
    references_n = "NR",
    cited_references = "CR",
    citation_subset = "SS",
    notes = "NT",
    comments = "NO",
    end_record = "ER",
    end_file = "EF",
    eppi_id = "U1",
    custom_info = "U1"
  )
}

.reverse_map_ris_tags <- function(df) {
  reverse_tag_map <- .get_reverse_ris_tag_map()
  result_df <- data.frame(matrix(ncol = 0, nrow = nrow(df)))
  
  # Process each column in the input data frame and map back to RIS tags
  for (col_name in names(df)) {
    base_name <- sub("[0-9]+$", "", col_name)
    ris_tag <- if (base_name %in% names(reverse_tag_map)) {
      reverse_tag_map[[base_name]]
    } else if (col_name %in% names(reverse_tag_map)) {
      reverse_tag_map[[col_name]]
    } else {
      col_name
    }
    result_df[[ris_tag]] <- df[[col_name]]
  }
  result_df
}

# Helper function: normalize whitespace consistently across read/write.
.norm_space <- function(x) gsub("[[:space:]]+", " ", trimws(x))

# Helper function: decide which RIS tag to write for a given column and row.
.resolve_ris_tag <- function(col_name, row_idx, tag_meta = NULL, reverse_tag_map = NULL) {
  if (is.list(tag_meta) && !is.null(tag_meta[[col_name]]) && # if metadata exists for this column
      length(tag_meta[[col_name]]) >= row_idx && # and has entry for this row
      !is.na(tag_meta[[col_name]][[row_idx]]) && # and is not NA
      nzchar(tag_meta[[col_name]][[row_idx]])) { # and is not empty
    return(tag_meta[[col_name]][[row_idx]]) # use the original RIS tag used
  }
  if (grepl("^[A-Z0-9]{2,4}$", col_name)) return(col_name) # already a RIS tag
  base_name <- sub("[0-9]+$", "", col_name) # strip trailing digits for repeated fields
  if (!is.null(reverse_tag_map) && base_name %in% names(reverse_tag_map)) return(reverse_tag_map[[base_name]]) # if mapped, use that
  col_name
}

Try the AIscreenR package in your browser

Any scripts or data that you put into this service are public.

AIscreenR documentation built on April 14, 2026, 1:08 a.m.