Nothing
#' Main R Interface to 'Apache Tika'
#'
#' Extract text or metadata from over a thousand file types.
#' Get either plain text or structured \code{XHTML}.
#' Metadata includes \code{Content-Type}, character encoding, and Exif data from
#' jpeg or tiff images. To see the long list of supported file types,
#' click the "Supported Formats" link on this page:
#' \url{https://tika.apache.org/}.
#'
#' @param input Character vector describing the paths to the input documents.
#' Strings starting with 'http://','https://', or 'ftp://' are downloaded to a
#' temporary directory. On Windows, the local paths cannot span
#' drives because of a Windows convention.
#' @param output Optional character vector of the output format. The default,
#' \code{"text"}, gets plain text without metadata. \code{"xml"} and
#' \code{"html"} get \code{XHTML} text with metadata. \code{"jsonRecursive"}
#' gets \code{XHTML} text and \code{json} metadata.
#' \code{c("jsonRecursive","text")} or \code{c("J","t")} get plain text and
#' \code{json} metadata. See the 'Output Details' section.
#' @param output_dir Optional directory path to save the converted files in.
#' Tika may overwrite files so an empty directory is best. See the 'Output
#' Details' section before using.
#' @param return Logical if an R object should be returned. Defaults to
#' TRUE. If set to FALSE, then output_dir (above) must be specified.
#' @param java Optional command to invoke Java. For example, it can be the full
#' path to a particular Java version. See the Configuration section below.
#' @param jar Optional alternative path to a \code{tika-app-X.XX.jar}. Useful
#' if this package becomes out of date.
#' @param threads Integer of the number of file consumer threads Tika uses.
#' Defaults to 2.
#' @param max_restarts Integer of the maximum number of times the watchdog
#' process will restart the child process. The default is no limit.
#' @param timeout Integer of the number of milliseconds allowed to a parse
#' before the process is killed and restarted. Defaults to 300000.
#' @param max_file_size Integer of the maximum bytes allowed.
#' Do not process files larger than this. The default is unlimited.
#' @param config Path to the XML config file. Defaults to \code{system.file("extdata", "ocr.xml", package = "rtika")}. There is also a \code{no-ocr.xml} file available.
#' @param args Optional character vector of additional arguments passed to Tika,
#' that may not yet be implemented in this R interface, in the pattern of
#' \code{c('-arg1','setting1','-arg2','setting2')}.
#' @param quiet Logical if Tika command line messages and errors are to be
#' suppressed. Defaults to \code{TRUE}.
#' @param cleanup Logical to clean up temporary files after running the command,
#' which can accumulate. Defaults to \code{TRUE}. They are in \code{tempdir()}. These
#' files are automatically removed at the end of the R session even if set to
#' FALSE.
#' @param lib.loc Optional character vector describing the library paths.
#' Normally, it's best to
#' leave this parameter alone. The parameter is included
#' mainly for package testing.
#' @return A character vector in the same order and with the same length as
#' \code{input}. Unprocessed files are \code{as.character(NA)}.
#' If \code{return = FALSE}, then a \code{NULL} value is invisibly returned.
#' See the Output Details section below.
#' @examples
#' \donttest{
#' #extract text
#' batch <- c(
#' system.file("extdata", "jsonlite.pdf", package = "rtika"),
#' system.file("extdata", "curl.pdf", package = "rtika"),
#' system.file("extdata", "table.docx", package = "rtika"),
#' system.file("extdata", "xml2.pdf", package = "rtika"),
#' system.file("extdata", "R-FAQ.html", package = "rtika"),
#' system.file("extdata", "calculator.jpg", package = "rtika"),
#' system.file("extdata", "tika.apache.org.zip", package = "rtika")
#' )
#' text = tika(batch)
#' cat(substr(text[1],45,450))
#'
#' #more complex metadata
#' if(requireNamespace('jsonlite')){
#'
#' json = tika(batch,c('J','t'))
#' # 'J' is shortcut for jsonRecursive
#' # 't' for text
#' metadata = lapply(json, jsonlite::fromJSON )
#'
#' #embedded resources
#' lapply(metadata, function(x){ as.character(x$'Content-Type') })
#'
#' lapply(metadata, function(x){ as.character(x$'Creation-Date') })
#'
#' lapply(metadata, function(x){ as.character(x$'X-TIKA:embedded_resource_path') })
#' }
#' }
#' @section Output Details:
#' If an input file did not exist, could not be downloaded, was a directory, or
#' Tika could not process it, the result will be \code{as.character(NA)} for
#' that file.
#'
#' By default, \code{output = "text"} and this produces plain text with no
#' metadata. Some formatting is preserved in this case using tabs, newlines and
#' spaces.
#'
#' Setting \code{output} to either \code{"xml"} or the shortcut \code{"x"} will
#' produce a strict form of \code{HTML} known as \code{XHTML}, with metadata in
#' the \code{head} node and formatted text in the \code{body}.
#' Content retains more formatting with \code{"xml"}. For example, a Word or
#' Excel table will become a HTML \code{table}, with table data as text in
#' \code{td} elements. The \code{"html"} option and its shortcut \code{"h"}
#' seem to produce the same result as \code{"xml"}.
#' Parse XHTML output with \code{xml2::read_html}.
#'
#' Setting \code{output} to \code{"jsonRecursive"} or its shortcut \code{"J"}
#' produces a tree structure in `json`. Metadata fields are at the top level.
#' The \code{XHTML} or plain text will be found in the \code{X-TIKA:content}
#' field. By default the text is \code{XHTML}. This can be changed to plain
#' text like this: \code{output=c("jsonRecursive","text")} or
#' \code{output=c("J","t")}. This syntax is meant to mirror Tika's. Parse
#' \code{json} with \code{jsonlite::fromJSON}.
#'
#' If \code{output_dir} is specified, then the converted files will also be
#' saved to this directory. It's best to use an empty directory because Tika
#' may overwrite existing files. Tika seems to add an extra file extension to
#' each file to reduce the chance, but it's still best to use an empty
#' directory. The file locations within the \code{output_dir} maintain the same
#' general path structure as the input files. Downloaded files have a path
#' similar to the `tempdir()` that R uses. The original paths are now relative
#' to \code{output_dir}. Files are appended with \code{.txt} for the default
#' plain text, but can be \code{.json}, \code{.xml}, or \code{.html} depending
#' on the \code{output} setting. One way to get a list of the processed files
#' is to use \code{list.files} with \code{recursive=TRUE}.
#' If \code{output_dir} is not specified, files are saved to a volatile temp
#' directory named by \code{tempdir()} and will be deleted when R shuts down.
#' If this function will be run on very large batches repeatedly, these
#' temporary files are cleaned up after every call when
#' \code{cleanup=TRUE}, which is the default.
#' @section Background:
#' Tika is a foundational library for several Apache projects such as the Apache
#' Solr search engine. It has been in development since at least 2007. The most
#' efficient way I've found to process many thousands of documents is Tika's
#' 'batch' mode, which is the only mode used in `rtika`. There are potentially
#' more things that can be done, given enough time and attention, because
#' Apache Tika includes many libraries and methods in its .jar file. The source is available at:
#' \url{https://tika.apache.org/}.
#' @section Installation:
#' Tika requires Java 8.
#'
#' Java installation instructions are at https://openjdk.org/install/
#' or https://www.java.com/en/download/help/download_options.xml.
#'
#' By default, this R package internally invokes Java by calling the \code{java}
#' command from the command line. To specify the path to a particular Java
#' version, set the path in the \code{java} argument of the \code{tika}
#' function.
#'
#'
#' @export
tika <- function(input,
                 output = c("text", "jsonRecursive", "xml", "html")[1],
                 output_dir = "",
                 return = TRUE,
                 java = rtika::java(),
                 jar = rtika::tika_jar(),
                 threads = 2,
                 max_restarts = integer(),
                 timeout = 300000,
                 max_file_size = integer(),
                 config = system.file("extdata", "ocr.xml", package = "rtika"),
                 args = character(),
                 quiet = TRUE,
                 cleanup = TRUE,
                 lib.loc = .libPaths()) {
  # Overview of this function:
  #   1. validate the arguments;
  #   2. translate `output` into Tika command line flags;
  #   3. resolve the output directory and fetch any URLs in `input`;
  #   4. write the list of existing input files to a temporary file;
  #   5. run the Tika jar once over that list through `sys::exec_wait`;
  #   6. if `return` is TRUE, read each converted file back into a character
  #      vector aligned with `input` (NA where nothing was produced).
  #
  # Maintainer notes ----------------------------------------------------
  # Special thanks to Hadley for the git tutorial at:
  # http://r-pkgs.had.co.nz/git.html
  # To update tika version, update
  # (1) DESCRIPTION of the Version
  # (2) R/zzz.R file variable: "tika_jar_tested_version"
  # (3) R/install_tika "version" and sha512 "digest" of tika-app at https://tika.apache.org/download.html
  # (4) NEWS.md to the version
  # When updating the package, run these functions in order:
  # make sure required packages are installed including: devtools, knitr,rmarkdown, pkgdown
  # devtools::document() # sets up NAMESPACE and .Rd documentation files to match function
  # Use Rstudio's "Build" > "Install and Restart"
  # if necessary, run "rtika::install_tika()" to update the .jar
  # devtools::test();
  # devtools::build()
  # devtools::build_vignettes()
  # Use Rstudio's "Build" > "Install and Restart"
  # Sys.setenv(NOT_CRAN = TRUE);
  # pkgdown::clean_site() ; pkgdown::build_site() # https://www.r-bloggers.com/building-a-website-with-pkgdown-a-short-guide/
  # Rstudio "Build" > "More" > "Build Source Package"
  # upload to github for tests!!
  # upload to CRAN
  # Suggested functions to run occasionally:
  # goodpractice::gp()
  #
  # styler::style_dir() # note this has made some files break in the past,
  # TODO: memory setting with java -Xmx1024m -jar.
  # Probably also adjust child process -JXmx4g

  # Parameter sanity check --------------------------------------------
  # The checks run in order and stop at the first failure, so `input` is
  # validated before the `java` and `jar` defaults are evaluated.
  # is.character()/is.numeric()/is.logical() are used instead of comparing
  # class() so that classed vectors are accepted as well.
  stopifnot(
    is.character(input),
    length(input) > 0,
    is.character(output),
    length(output) > 0,
    length(output) <= 2,
    is.character(output_dir),
    length(output_dir) == 1,
    !is.na(output_dir),
    is.logical(return),
    length(return) == 1,
    is.character(java),
    length(java) == 1,
    !any(is.na(jar)),
    is.character(jar),
    length(jar) == 1,
    nchar(jar) > 0,
    is.numeric(threads),
    length(threads) <= 1,
    is.numeric(max_restarts),
    length(max_restarts) <= 1,
    is.numeric(timeout),
    length(timeout) <= 1,
    is.numeric(max_file_size),
    length(max_file_size) <= 1,
    is.character(config),
    is.character(args),
    is.logical(quiet),
    is.logical(cleanup),
    is.character(lib.loc),
    # Without an output directory the only way to get results is the
    # returned R object, so `return` must then be TRUE.
    nchar(output_dir) > 0 || isTRUE(return)
  )
  # TODO: consider a config file
  # for fine grained control over parsers.
  # see: https://tika.apache.org/1.19/configuring.html
  # but waiting for batch format to stabilize.

  # Define return variable structure -----------------------------------
  # One element per input, NA until a converted file is read back.
  failure <- NA_character_
  out <- rep(failure, length(input))

  # Parameter tidying ------------------------------------------------
  # Each format can be requested by its long name, its shortcut, or the
  # Tika flag itself.
  wants <- function(aliases) any(output %in% aliases)
  wants_json <- wants(c("jsonRecursive", "J", "-J"))
  wants_text <- wants(c("text", "t", "-t"))
  wants_xml <- wants(c("xml", "x", "-x"))
  wants_html <- wants(c("html", "h", "-h"))
  # output_flag is the output format for the Tika command line; "-J" goes
  # first. An unmatched `if` yields NULL, which c() drops.
  output_flag <- as.character(c(
    if (wants_json) "-J",
    if (wants_text) "-t",
    if (wants_xml) "-x",
    if (wants_html) "-h"
  ))

  # output_dir parameter stores tika's processed files.
  # If it was not given, create one in the temp directory.
  if (output_dir == "") {
    # The filenames are guaranteed not to be currently in use.
    output_dir <- tempfile("rtika_dir")
    dir.create(output_dir)
    output_dir <- normalizePath(output_dir, mustWork = TRUE, winslash = "/")
  } else {
    # if an output directory is provided, check it exists.
    output_dir <- normalizePath(output_dir, mustWork = TRUE, winslash = "/")
    # Must be very careful writing to any directory outside of temp directory.
    # The idea here is to check it is not the root directory.
    if (output_dir == normalizePath("/", winslash = "/")) {
      stop("Output directory should not be the same as the system root.")
    }
  }

  # input parameter may contain URLs. Hand those to tika_fetch() and use the
  # paths it returns in their place.
  toDownload <- grep(
    "^(http[s]?:/|ftp:/|file:/)",
    input,
    ignore.case = TRUE
  )
  if (length(toDownload) > 0) {
    input[toDownload] <- tika_fetch(input[toDownload])
  }

  # Input can have issues.
  # Keep only entries that are not NA, exist, and are not directories.
  file_exists <- !is.na(input) & file.exists(input) & !dir.exists(input)
  if (!any(file_exists)) {
    warning("No files could be found.")
    return(out)
  }

  # inputFiles and fileList will contain files for Tika to process ------
  inputFiles <- normalizePath(input[file_exists], winslash = "/")
  # Tika expects files to be relative to root
  # NB: These paths will be READ, NOT written to!
  # Original way to get root path:
  # root <- normalizePath("/", winslash = "/")
  # on Windows, files cannot span drives. Tika expects relative paths, and
  # Windows does not allow relative paths to span drives!
  # The root is taken from the first input file only.
  root <- regmatches(
    inputFiles[1],
    regexpr("^([A-Za-z:]*/)", inputFiles[1])
  )[1]
  # remove preceding drive letter on Windows, preceding forward slash on
  # *nix and OS X.
  inputFiles <- sub(root, "", inputFiles, fixed = TRUE)

  # fileList is a file holding inputFiles, one per line, passed to Tika.
  # File paths containing both commas and quote characters appear to work.
  fileList <- normalizePath(
    tempfile("rtika_file"),
    mustWork = FALSE,
    winslash = "/"
  )
  utils::write.table(
    inputFiles,
    fileList,
    row.names = FALSE,
    col.names = FALSE,
    sep = ",",
    quote = FALSE
  )
  # After the file is created, make sure it exists
  if (!file.exists(fileList)) {
    stop('Could not write to the tempfile for "fileList": ', fileList)
  }

  # java call -----------------------------------------------------
  # Each optional setting contributes a flag/value pair only when supplied.
  numConsumers <- character()
  maxRestarts <- character()
  timeoutThresholdMillis <- character()
  maxFileSizeBytes <- character()
  configTika <- character()
  if (length(threads) > 0) {
    numConsumers <- c("-numConsumers", as.character(as.integer(threads)))
  }
  if (length(max_restarts) > 0) {
    maxRestarts <- c("-maxRestarts", as.character(as.integer(max_restarts)))
  }
  if (length(timeout) > 0) {
    timeoutThresholdMillis <- c(
      "-timeoutThresholdMillis",
      as.character(as.integer(timeout))
    )
  }
  # The condition must test the `max_file_size` argument. Testing the empty
  # local `maxFileSizeBytes` meant the limit was never passed to Tika.
  if (length(max_file_size) > 0) {
    maxFileSizeBytes <- c(
      "-maxFileSizeBytes",
      as.character(as.integer(max_file_size))
    )
  }
  if (length(config) > 0 && nchar(config[1]) > 0) {
    configTika <- c("-c", config[1])
  }
  java_args <- c(
    "-Djava.awt.headless=true",
    "-jar", jar,
    numConsumers,
    maxRestarts,
    timeoutThresholdMillis,
    maxFileSizeBytes,
    configTika,
    args, output_flag, "-i", root,
    "-o", output_dir,
    "-fileList", fileList
  )
  sys::exec_wait(
    cmd = java[1], args = java_args, std_out = !quiet,
    std_err = !quiet, std_in = FALSE
  )

  # retrieve results --------------------------------------------------------
  if (return) {
    # Extension appended to each converted file name. Later formats in the
    # original chain override earlier ones, giving the priority
    # json > html > xml > text. If `output` matched no known format the
    # affix is NA, no file is found, and the results stay NA.
    output_file_affix <- if (wants_json) {
      ".json"
    } else if (wants_html) {
      ".html"
    } else if (wants_xml) {
      ".xml"
    } else if (wants_text) {
      ".txt"
    } else {
      failure
    }

    # Read one converted file in full, or give NA when it does not exist.
    # Reading is skipped for missing files because readChar() raises an error
    # on a path it cannot open, which would abort the whole batch.
    read_output <- function(path) {
      bytes <- file.size(path)
      if (is.na(bytes)) {
        failure
      } else {
        text <- readChar(path, nchars = bytes, useBytes = TRUE)
        if (length(text) == 1L) text else failure
      }
    }

    # Clean & check with normalizePath, with warnings if not processed.
    output_files <- normalizePath(
      file.path(output_dir, paste0(inputFiles, output_file_affix)),
      winslash = "/"
    )
    out[file_exists] <- vapply(
      output_files,
      read_output,
      character(1),
      USE.NAMES = FALSE
    )
    # From studying the source code, the Tika batch processor defaults to UTF-8
    # every batch config xml file uses a FSOutputStreamFactory set to UTF-8

    # cleanup temp files -----------------------------------------------------
    # Removes every "rtika_file*" file and "rtika_dir*" directory found in
    # tempdir(), not only those created by this call.
    if (cleanup) {
      trash_file <- file.path(normalizePath(tempdir()), list.files(
        tempdir(),
        pattern = "^rtika_file"
      ))
      if (length(trash_file) > 0) {
        tmp <- file.remove(trash_file)
      }
      trash_dir <- file.path(normalizePath(tempdir()), list.files(
        tempdir(),
        pattern = "^rtika_dir"
      ))
      if (length(trash_dir) > 0) {
        # if(.Platform$OS.type=='windows'){
        # On windows, deleting dir does not seem to work unless force=TRUE.
        # but do not want to force this.
        # tmp= unlink( trash_dir, recursive =TRUE, force=TRUE)
        # } else {
        tmp <- unlink(trash_dir, recursive = TRUE)
        # }
      }
    }
    return(out)
  } else {
    # if return==FALSE
    # The converted files live in the caller's output_dir and are kept; only
    # the temporary file list written by this call is removed.
    if (isTRUE(cleanup)) {
      unlink(fileList)
    }
    invisible(NULL)
  }
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.