# Nothing
## ----setup, include = FALSE---------------------------------------------------
# Vignette code is evaluated only when the NOT_CRAN environment variable
# equals "true" (compared case-insensitively).
NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")

knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# When evaluation is enabled and tika_jar() reports NA, install Tika first.
if (NOT_CRAN && is.na(rtika::tika_jar())) {
  rtika::install_tika()
}
## ---- eval=NOT_CRAN-----------------------------------------------------------
library("rtika")
library("magrittr")

# To process ALL the files under a directory instead, build the batch with:
# my_path <- "~"
# batch <- file.path(my_path,
#                    list.files(path = my_path,
#                               recursive = TRUE))
# and pipe that batch into tika_text() to get plain text.

# Test files: resolve each name inside the "extdata" folder of rtika.
batch <- vapply(
  c(
    "jsonlite.pdf",
    "curl.pdf",
    "table.docx",
    "xml2.pdf",
    "R-FAQ.html",
    "calculator.jpg",
    "tika.apache.org.zip"
  ),
  function(file_name) system.file("extdata", file_name, package = "rtika"),
  character(1),
  USE.NAMES = FALSE
)

# Pipe the batch into tika_text().
text <- batch %>% tika_text()

# normal syntax also works:
# text <- tika_text(batch)
## ---- eval=NOT_CRAN-----------------------------------------------------------
# List the inputs whose result in `text` is NA, i.e. the files that had an
# issue. Handle them if needed.
batch[which(is.na(text))]
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Number of results, then the subset whose text matches " is ".
length(text)
search <- grep(" is ", text, value = TRUE)
length(search)
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Fresh temporary directory to receive the downloads.
download_directory <- tempfile("rtika_")
dir.create(download_directory)

urls <- c(
  "https://tika.apache.org/",
  "https://cran.rstudio.com/web/packages/keras/keras.pdf"
)

# Plain call form of: urls %>% tika_fetch(download_directory)
downloaded <- tika_fetch(urls, download_directory)

# it will add the appropriate file extension to the downloads
downloaded
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Create a directory that is not already in use.
my_directory <- tempfile("rtika_")
dir.create(my_directory)

# Pipe the batch to tika_text(), passing my_directory as output_dir and
# return = FALSE (per the argument names: write files instead of returning
# the text -- confirm against the rtika documentation).
batch %>%
  tika_text(
    threads = 4,
    return = FALSE,
    output_dir = my_directory
  )

# List all the file locations: every file found recursively under
# my_directory, prefixed with the normalized directory path.
processed_files <- file.path(
  normalizePath(my_directory),
  list.files(my_directory, recursive = TRUE)
)
## ---- eval=NOT_CRAN-----------------------------------------------------------
processed_files
## ---- eval=NOT_CRAN-----------------------------------------------------------
library("xml2")

# Get each document as XHTML text and parse it with xml2.
html <-
  batch %>%
  tika_html() %>%
  lapply(xml2::read_html)

# Parse links from the documents: the href attribute of every <a> element,
# one character vector per document.
links <-
  html %>%
  lapply(xml2::xml_find_all, "//a") %>%
  lapply(xml2::xml_attr, "href")

# Show up to 10 randomly chosen links from the first document. The sample
# size is capped at the number of links because sample() without replacement
# raises an error when asked for more elements than the vector holds.
sample(links[[1]], min(10, length(links[[1]])))
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Content-Type: for each parsed document take the first matching <meta>
# element, read its "content" attribute, and flatten to a vector.
html %>%
  lapply(function(doc) {
    xml2::xml_find_first(doc, '//meta[@name="Content-Type"]')
  }) %>%
  lapply(function(node) xml2::xml_attr(node, "content")) %>%
  unlist()

# Creation-Date: same lookup for the Creation-Date <meta> element.
html %>%
  lapply(function(doc) {
    xml2::xml_find_first(doc, '//meta[@name="Creation-Date"]')
  }) %>%
  lapply(function(node) xml2::xml_attr(node, "content")) %>%
  unlist()
## ---- eval=NOT_CRAN-----------------------------------------------------------
library("jsonlite")
# batch <- system.file("extdata", "calculator.jpg", package = "rtika")

# Parse the JSON produced for each file in the batch; a list of data.frames.
metadata <- lapply(tika_json(batch), jsonlite::fromJSON)

# Look at metadata for an image: element 6 of the results (assumes results
# are in the same order as `batch` and that its sixth entry is the image).
str(metadata[[6]])
## ---- eval=NOT_CRAN-----------------------------------------------------------
metadata[[6]]$`geo:lat`
metadata[[6]]$`geo:long`
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Commands along these lines fetch a page and bundle it into an archive:
# wget gets a webpage and other files.
# sys::exec_wait('wget', c('--page-requisites', 'https://tika.apache.org/'))
# Put it all into a .zip file
# sys::exec_wait('zip', c('-r', 'tika.apache.org.zip' ,'tika.apache.org'))

# From here on the batch is the single .zip archive shipped with rtika.
batch <- system.file("extdata", "tika.apache.org.zip", package = "rtika")

# a list of data.frames
metadata <- lapply(tika_json(batch), jsonlite::fromJSON)
# The structure is very long. See it on your own with: str(metadata)
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Pull the 'X-TIKA:embedded_resource_path' field out of each metadata entry.
embedded_resource_path <- lapply(metadata, function(doc) {
  doc$`X-TIKA:embedded_resource_path`
})
embedded_resource_path
## ---- eval=NOT_CRAN-----------------------------------------------------------
# Same extraction for the 'Content-Type' field.
content_type <- lapply(metadata, function(doc) doc$`Content-Type`)
content_type
## ---- eval=NOT_CRAN-----------------------------------------------------------
# And for the 'X-TIKA:content' field; print only its structure.
content <- lapply(metadata, function(doc) doc$`X-TIKA:content`)
str(content)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.