#' Extract Text from RTF (Rich Text Format) File
#' @description Parses an RTF file and extracts plain text as character vector.
#' @param file Path to an RTF file. Must be character of length 1.
#' @param verbose Logical. If TRUE, progress report is printed on console.
#' While it can be informative when parsing a large file,
#' this option itself makes the process slow.
#' @param row_start,row_end strings to be added at the beginning and end of table rows
#' @param cell_end string to be put at the end of table cells
#' @param ignore_tables if \code{TRUE}, no special treatment for tables
#' @param check_file if \code{TRUE}, conducts a quick check on the file if it is an RTF file.
#' If the file fails to pass the check, returns NULL without parsing the file.
#' @param ... Addional arguments passed to \code{\link{readLines}}
#' @return Character vector of extracted text
#' @export
#' @examples read_rtf(system.file("extdata/king.rtf", package = "striprtf"))
#' @references
#' \itemize{
# \item{Python 3 implementation by Gilson Filho: \url{https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676}}
#' \item{Original discussion thread: \url{https://stackoverflow.com/a/188877}}
#' \item{Code page table: \url{https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/}}
#' }
#' @details Rich text format (RTF) files are written as a text file
#' consisting of ASCII characters. The specification has been developed by
#' Microsoft.
#' This function interprets the character strings and extracts plain texts of
#' the file.
#' Major part of the algorithm of this function comes from a
#' stack overflow thread (\url{https://stackoverflow.com/a/188877}) and
#' the references therein.
#' This function is a translation of the above to R language, associated with C++ codes for enhancement.
#'
#' An advance from the preceding implementation is that the function
#' accomodates with various ANSI code pages. For example, RTF files created
#' by Japanese version of Microsoft Word marks \code{\\ansicpg932}, which indicates
#' the code page 932 is used for letter-code conversion.
#' The function detects the code page indication and
#' convert the characters to UTF-8 where possible. Conversion tables are retrieved from
#' here: (\url{https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/}).
#'
read_rtf <- function(file, verbose = FALSE,
row_start = "*| ", row_end = "", cell_end = " | ", ignore_tables=FALSE,
check_file=TRUE,
...)
{
stopifnot(is.character(file))
stopifnot(length(file) == 1L)
if (check_file && !looks_rtf(file)) {
message(sprintf("File '%s' does not seem to be an RTF file.", file))
return(NULL)
}
readLines(file, warn = FALSE, ...) %>%
paste0(collapse = "\n") %>%
strip_rtf(verbose, row_start, row_end, cell_end, ignore_tables)
}
#' @rdname read_rtf
#' @param text Character of length 1. Expected to be contents of an RTF file.
#' @export
strip_rtf <- function(text, verbose = FALSE,
row_start = "*| ", row_end = "", cell_end = " | ", ignore_tables=FALSE)
{
stopifnot(is.character(text))
stopifnot(is.logical(verbose))
stopifnot(length(verbose) > 0)
if (length(verbose) > 1) {
verbose <- verbose[1]
warning("only first element of verbose is used")
}
# if text has more than 1 length, collapse
if (length(text) > 1) text <- paste0(text, collapse = "\n")
# obtain code page
cp <- stringr::str_match(text, "\\\\ansicpg([0-9]+)")[,2]
if (is.na(cp)) {
cpname <- NA_character_
code_before <- integer(0)
code_after <- integer(0)
} else {
cpname <- paste("CP", cp, sep = "")
code_before <- .cptable[[cpname]]$before
code_after <- .cptable[[cpname]]$after
}
pattern <- stringr::regex(
"\\\\([a-z]{1,32})(-?\\d{1,10})?[ ]?|\\\\'([0-9a-f]{2})|\\\\([^a-z])|([{}])|[\r\n]+|(.)",
ignore_case = TRUE)
match_mat <- stringr::str_match_all(text, pattern)[[1]]
if (nrow(match_mat) == 0) return(character(0))
# str_match_all now returns NAs for unmatched optional groups
# I replace NAs with "" so that the c++ function works as before
match_mat[is.na(match_mat)] <- ""
tmp_rep <- unused_letters(c(text, row_start, row_end, cell_end),
2, as_number=TRUE)
tmp_rep_str <- intToUtf8(tmp_rep) %>% strsplit("") %>% unlist()
keys <- .specialchars$keys
hexstr <- .specialchars$hexstr
hexstr[keys=="row"] <- sprintf("x%04d", tmp_rep[1])
hexstr[keys=="cell"] <- sprintf("x%04d", tmp_rep[2])
# use c++ helper function to parse
parsed <- strip_helper(match_mat,
dest_names = .destinations,
special_keys = keys,
special_hex = hexstr,
code_before = code_before,
code_after = code_after,
verbose = verbose)
#print(parsed)
# convert integer codes to utf8
out <- lapply(parsed$intcode, intToUtf8) %>% unlist()
#print(out)
# code page translation ... will be done in the strip_helper function
# if (!is.na(cpname)) {
# if (cpname %in% names(.cptable)) {
# out[parsed$toconv] <- chartr(.cptable[[cpname]]$before,
# .cptable[[cpname]]$after,
# out[parsed$toconv])
# #out[parsed$toconv] <- lapply(out[parsed$toconv], function(a) {
# # chartr(.cptable[[cpname]]$before, .cptable[[cpname]]$after, a)
# #})
# } else {
# warning("conversion table for ", cpname, " is missing")
# }
# }
# if there is no table or ignore table option is specified,
# remove tmp_rep characters,
# split by line breaks,
# and return
if (!any(parsed$table) || ignore_tables) {
out <- out %>% paste0(collapse = "") %>%
strsplit("\n") %>%
unlist()
regx <- sprintf("[%s]", paste0(tmp_rep_str, collapse = ""))
out <- gsub(regx, "", out)
return(out)
}
#print(out)
# non-table sections are split by line breaks
# table sections are split by \row
# so we replace line breaks
out[!parsed$table] <- stringr::str_replace_all(out[!parsed$table], "\n", tmp_rep_str[1])
out <- strsplit(out, tmp_rep_str[1])
# unlist, with table flag kept tracked
len <- lapply(out, length) %>% unlist()
out <- unlist(out)
table_flg <- Map(rep, parsed$table, len) %>% unlist()
#print(out)
#print(table_flg)
# remove empty table sections
emp_tbl <- (nchar(out) == 0) & table_flg
out <- out[!emp_tbl]
table_flg <- table_flg[!emp_tbl]
# cell separators are replaced
out[table_flg] <- stringr::str_replace_all(out[table_flg], tmp_rep_str[2], cell_end)
# row start and end indicators added
out[table_flg] <- paste(row_start, out[table_flg], row_end, sep = "")
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.