R/read.R

Defines functions parse_line_group split_into_line_groups extract_metadata warn_if_not_po_file is_po_file read_po empty_countable_msgs empty_direct_msgs match_and_extract

Documented in read_po

RX <- list(
  # "capture(some letters or hyphens): capture(some stuff)\n"
  metadata                 = '^"([a-zA-Z-]+): *(.+)\\\\n"$',
  # optional(obsolete comment) msgid "capture(some stuff)"
  msgid                    = '^(#~)? *msgid *"(.+)"$',
  # optional(obsolete comment) msgid_plural "capture(some stuff)"
  msgid_plural             = '^(?:#~)? *msgid_plural *"(.+)"$',
  # optional(obsolete comment) msgstr "capture(some stuff)"
  msgstr_direct            = '^(?:#~)? *msgstr *" *(.*)"$',
  # optional(obsolete comment) msgstr[index] "capture(some stuff)"
  msgstr_countable         = '^(?:#~)? *msgstr(?:\\[([0-9]+)\\])? *"(.*)"$',
  # optional(obsolete comment) msgctxt "capture(some stuff)"
  msgctxt                  = '^(?:#~)? *msgctxt *"(.+)"$',
  # hash, not (caret or comma or colon or pipe or tilde) capture(some stuff)
  translator_comment       = "^#(?=[^,:|~]) *(.+)$",
  # hash, colon capture(some stuff)
  source_reference_comment = "^#: *(.+)$",
  # hash, comma capture(some stuff)
  flags_comment            = "^#, *(.+)$",
  # hash, pipe capture(some stuff)
  previous_string_comment  = "^#\\| *(.+)$",
  # maybe some spaces
  blank                    = "^ *$"
)

# avoid warning in R RMCD check
utils::globalVariables(".")

#' @importFrom magrittr extract
#' @importFrom stringi stri_match_first_regex
match_and_extract <- function(x, rx, drop = TRUE)
{
  x %>%
    stri_match_first_regex(rx) %>%
    extract(!is.na(.[, 1L]), -1L, drop = drop)
}

empty_direct_msgs <- function() {
  tibble(
    msgid                     = character(),
    msgstr                    = character(),
    is_obsolete               = logical(),
    msgctxt                   = list(),
    translator_comments       = list(),
    source_reference_comments = list(),
    flags_comments            = list(),
    previous_string_comments  = list()
  )
}

empty_countable_msgs <- function() {
  tibble(
    msgid                     = character(),
    msgid_plural              = character(),
    msgstr                    = list(),
    is_obsolete               = logical(),
    msgctxt                   = list(),
    translator_comments       = list(),
    source_reference_comments = list(),
    flags_comments            = list(),
    previous_string_comments  = list()
  )
}

#' Read PO and POT files
#'
#' Reads .PO and .POT translation files.
#' @param po_file A string giving a path to a PO file.
#' @return An object of class \code{\link{po}}. The \code{source_type} and
#' \code{file_type} elements are automatically determined from the file
#' name.
#' @seealso \code{\link[tools]{xgettext}}
#' @examples
#' # read_po is used for both po and pot files
#' pot_file <- system.file("extdata/R-summerof69.pot", package = "poio")
#' (pot <- read_po(pot_file))
#' @importFrom assertive.properties is_empty
#' @importFrom assertive.properties is_non_empty
#' @importFrom assertive.properties is_scalar
#' @importFrom assertive.types assert_is_a_string
#' @importFrom assertive.files assert_all_are_existing_files
#' @importFrom dplyr filter
#' @importFrom dplyr select
#' @importFrom dplyr bind_cols
#' @importFrom dplyr bind_rows
#' @importFrom magrittr %>%
#' @importFrom magrittr extract
#' @importFrom magrittr extract2
#' @importFrom purrr map
#' @importFrom purrr map_df
#' @importFrom purrr map_lgl
#' @importFrom purrr keep
#' @importFrom stringi stri_read_lines
#' @importFrom stringi stri_detect_regex
#' @importFrom stringi stri_match_first_regex
#' @importFrom tibble tibble
#' @importFrom tools file_ext
#' @export
read_po <- function(po_file)
{
  assert_is_a_string(po_file)
  assert_all_are_existing_files(po_file)
  warn_if_not_po_file(po_file)
  lines <- stri_read_lines(po_file, "UTF-8")

  base_file_name <- basename(po_file)
  source_type <- ifelse(
    substring(base_file_name, 1, 2) == "R-", "r", "c"
  )
  file_type <- file_ext(base_file_name)

  metadata_line_index <- which(stri_detect_regex(lines, RX$metadata))
  metadata_lines <- lines[metadata_line_index]
  metadata <- extract_metadata(metadata_lines)

  # Translator comments before metadata
  first_metadata_line <- metadata_line_index[1]
  lines_before_metadata <- lines[seq_len(first_metadata_line - 1L)]
  initial_comments <- match_and_extract(
    lines_before_metadata,
    RX$translator_comment
  )

  # How many plural forms are there?
  n_plural_forms <- get_n_plural_forms(metadata)

  # Now consider only lines after metadata
  last_metadata_line <- metadata_line_index[length(metadata_line_index)]
  lines <- lines[-seq_len(last_metadata_line)]

  # Split on blank lines
  line_groups <- split_into_line_groups(lines)

  # Special handling of empty case need because lapply
  # just returns list() otherwise.
  if(is_empty(line_groups))
  {
    msgs_direct <- empty_direct_msgs()
    msgs_countable <- empty_countable_msgs()
  } else # lines groups is not empty
  {
    all_msgs <- lapply(line_groups, parse_line_group, n_plural_forms = n_plural_forms)
    is_direct <- all_msgs %>%
      map_lgl(~ .$msg_type == "direct")
    msgs_direct <- if(any(is_direct)) {
      all_msgs[is_direct] %>%
        map_df(~ .$msgs)
    } else {
      empty_direct_msgs()
    }
    msgs_countable <- if(any(!is_direct)) {
      all_msgs[!is_direct] %>%
        map_df(~ .$msgs)
    } else {
      empty_countable_msgs()
    }
  }

  po(
    source_type = source_type,
    file_type = file_type,
    initial_comments = initial_comments,
    metadata = metadata,
    direct = msgs_direct,
    countable = msgs_countable
  )
}

is_po_file <- function(po_file)
{
  stri_detect_regex(po_file, "\\.pot?$")
}

warn_if_not_po_file <- function(po_file)
{
  if(!is_po_file(po_file)) {
    warning("The file extension is neither 'po' nor 'pot'.")
  }
}

#' @importFrom stringi stri_match_first_regex
#' @importFrom tibble tibble
extract_metadata <- function(metadata_lines) {
  metadata <- metadata_lines %>%
    stri_match_first_regex(RX$metadata)
  tibble(
    name  = metadata[, 2],
    value = metadata[, 3]
  )
}

#' @importFrom assertive.properties is_non_empty
#' @importFrom purrr map
#' @importFrom purrr keep
#' @importFrom stringi stri_detect_regex
split_into_line_groups <- function(lines) {
  # Split on blank lines
  blank_lines <- cumsum(stri_detect_regex(lines, RX$blank))
  line_groups <- split(lines, blank_lines)

  line_groups %>%
    # Remove those empty lines
    map(~ .[!stri_detect_regex(., RX$blank)]) %>%
    # Remove empty groups
    keep(is_non_empty)
}

#' @importFrom assertive.properties is_empty
#' @importFrom tibble tibble
parse_line_group <- function(lines, n_plural_forms)
{
  msgid_match <- match_and_extract(lines, RX$msgid, drop = FALSE)
  # Where there is an obsolete comment, no msgids are possible
  # Currently the implementation allows for zero msgids under all
  # circumstances which is possibly too lenient, but the spec is unclear.
  if(nrow(msgid_match) != 1L)
  {
    stop(
      "There should be exactly one msgid found in each block. Are you missing a blank line?\nThe offending lines are:\n",
      paste(lines, collapse = "\n")
    )
  }
  is_obsolete <- !is.na(msgid_match[, 1L])
  msgid <- msgid_match[, 2L]

  comments <- tibble(
    translator_comments = list(
      match_and_extract(lines, RX$translator_comment)
    ),
    source_reference_comments = list(
      match_and_extract(lines, RX$source_reference_comment)
    ),
    flags_comments = list(
      match_and_extract(lines, RX$flags_comment)
    ),
    previous_string_comments = list(
      match_and_extract(lines, RX$previous_string_comment)
    )
  )
  msgctxt <- list(match_and_extract(lines, RX$msgctxt))
  msgid_plural <- match_and_extract(lines, RX$msgid_plural)
  if(is_empty(msgid_plural)) # direct msg
  {
    msgstr <- match_and_extract(lines, RX$msgstr_direct)
    list(
      msg_type = "direct",
      msgs = tibble(
        msgid       = msgid,
        msgstr      = msgstr,
        is_obsolete = is_obsolete,
        msgctxt     = msgctxt
      ) %>%
        bind_cols(comments)
    )
  } else # countable msg
  {
    msgstr_and_id <- match_and_extract(lines, RX$msgstr_countable)
    # In case of weird file with, e.g, msgstr[1] before msgstr[0], reorder
    o <- order(msgstr_and_id[, 1L])
    msgstr <- msgstr_and_id[o, 2L]
    # If badly formed with wrong number of plurals, add empty translations
    # up to no. of plural forms suggested by metadata (or ignore extras).
    length(msgstr) <- n_plural_forms
    msgstr <- list(msgstr)
    list(
      msg_type = "countable",
      msgs = tibble(
        msgid        = msgid,
        msgid_plural = msgid_plural,
        msgstr       = msgstr,
        is_obsolete  = is_obsolete,
        msgctxt      = msgctxt
      ) %>%
        bind_cols(comments)
    )
  }
}

Try the poio package in your browser

Any scripts or data that you put into this service are public.

poio documentation built on April 19, 2020, 4:16 p.m.