# inst/doc/rtika_introduction.R

## ----setup, include = FALSE---------------------------------------------------
# Gate for the chunks below: TRUE only when the NOT_CRAN environment
# variable is set to "true" (compared case-insensitively).
NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")

# Collapse source and output into one block and prefix output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# When evaluation is enabled and tika_jar() reports NA, install Tika first.
if (NOT_CRAN && is.na(rtika::tika_jar())) {
  rtika::install_tika()
}


## ---- eval=NOT_CRAN-----------------------------------------------------------

# Attach rtika for the tika_*() functions and magrittr for the %>% pipe.
library("rtika")
library("magrittr")

# To process ALL the files under a directory instead, build the batch so:
#
# my_path <- "~"
# batch <- file.path(my_path,
#                 list.files(path = my_path,
#                 recursive = TRUE))

# Test files, looked up one by one in the "extdata" folder of rtika.
batch <- vapply(
  c(
    "jsonlite.pdf",
    "curl.pdf",
    "table.docx",
    "xml2.pdf",
    "R-FAQ.html",
    "calculator.jpg",
    "tika.apache.org.zip"
  ),
  function(file_name) system.file("extdata", file_name, package = "rtika"),
  character(1),
  USE.NAMES = FALSE
)

# Hand the batch to tika_text() to get plain text.
text <- tika_text(batch)

# The magrittr pipe syntax also works:
# text <- batch %>% tika_text()


## ---- eval=NOT_CRAN-----------------------------------------------------------
# Show the entries of batch whose extracted text is NA,
# so they can be handled separately if needed.
batch[is.na(text)]

## ---- eval=NOT_CRAN-----------------------------------------------------------
# Number of extracted texts.
length(text)

# Keep only the texts that contain " is " (including the surrounding spaces).
search <- text[grepl(" is ", text)]

# Number of texts that matched.
length(search)

## ---- eval=NOT_CRAN-----------------------------------------------------------
# Create a new folder with a generated name starting with "rtika_".
download_directory <- tempfile("rtika_")
dir.create(download_directory)

urls <- c(
  "https://tika.apache.org/",
  "https://cran.rstudio.com/web/packages/keras/keras.pdf"
)

# Fetch the URLs into the download folder.
downloaded <- tika_fetch(urls, download_directory)

# tika_fetch() will add the appropriate file extension to the downloads
downloaded


## ---- eval=NOT_CRAN-----------------------------------------------------------
# Make a directory that is not already in use.
my_directory <- tempfile("rtika_")
dir.create(my_directory)

# Pipe the batch to tika_text() with 4 threads, return = FALSE,
# and my_directory as the output_dir.
batch %>%
  tika_text(
    threads = 4,
    return = FALSE,
    output_dir = my_directory
  )

# Paths of every file found (recursively) under my_directory,
# each prefixed with the normalized directory path.
processed_files <- file.path(
  normalizePath(my_directory),
  list.files(path = my_directory, recursive = TRUE)
)


## ---- eval=NOT_CRAN-----------------------------------------------------------
# Print the vector of processed file paths.
processed_files

## ---- eval=NOT_CRAN-----------------------------------------------------------
library("xml2")

# Get each document in the batch as XHTML text and parse it with xml2.
html <- lapply(tika_html(batch), xml2::read_html)

# For every parsed document, collect the href attribute of each <a> element.
links <- lapply(html, function(doc) {
  xml2::xml_attr(xml2::xml_find_all(doc, "//a"), "href")
})

# Show up to 10 randomly chosen links from the first document. The sample
# size is capped at the number of links available, because sample() raises
# an error when asked for more items than exist (without replacement).
sample(links[[1]], min(10, length(links[[1]])))

## ---- eval=NOT_CRAN-----------------------------------------------------------
# Content-Type: read the "content" attribute of the first matching <meta>
# element in each parsed document, then flatten the results with unlist().
unlist(lapply(html, function(doc) {
  xml2::xml_attr(
    xml2::xml_find_first(doc, '//meta[@name="Content-Type"]'),
    "content"
  )
}))

# Creation-Date: the same lookup for the Creation-Date <meta> element.
unlist(lapply(html, function(doc) {
  xml2::xml_attr(
    xml2::xml_find_first(doc, '//meta[@name="Creation-Date"]'),
    "content"
  )
}))


## ---- eval=NOT_CRAN-----------------------------------------------------------
library("jsonlite")
# batch <- system.file("extdata", "calculator.jpg", package = "rtika")

# Request JSON output for the batch and parse each result with jsonlite,
# giving a list of data.frames.
metadata <- lapply(tika_json(batch), jsonlite::fromJSON)

# Inspect the structure of the sixth entry: the metadata for an image.
str(metadata[[6]])


## ---- eval=NOT_CRAN-----------------------------------------------------------
# The 'geo:lat' and 'geo:long' fields of the sixth metadata entry.
metadata[[6]]$`geo:lat`
metadata[[6]]$`geo:long`

## ---- eval=NOT_CRAN-----------------------------------------------------------
# How the sample archive was made: wget gets a webpage and other files ...
# sys::exec_wait('wget', c('--page-requisites', 'https://tika.apache.org/'))
# ... and everything is then put into a .zip file:
# sys::exec_wait('zip', c('-r', 'tika.apache.org.zip' ,'tika.apache.org'))
batch <- system.file("extdata", "tika.apache.org.zip", package = "rtika")

# Parse the JSON output for the archive into a list of data.frames.
metadata <- lapply(tika_json(batch), jsonlite::fromJSON)

# The structure is very long; view it yourself with: str(metadata)


## ---- eval=NOT_CRAN-----------------------------------------------------------
# Pull the 'X-TIKA:embedded_resource_path' field out of each metadata entry.
embedded_resource_path <- lapply(
  metadata,
  function(entry) entry$`X-TIKA:embedded_resource_path`
)

embedded_resource_path

## ---- eval=NOT_CRAN-----------------------------------------------------------
# Pull the 'Content-Type' field out of each metadata entry.
content_type <- lapply(
  metadata,
  function(entry) entry$`Content-Type`
)

content_type

## ---- eval=NOT_CRAN-----------------------------------------------------------
# Pull the 'X-TIKA:content' field out of each metadata entry.
content <- lapply(
  metadata,
  function(entry) entry$`X-TIKA:content`
)

# Compact overview of what was extracted.
str(content)

# Try the rtika package in your browser
#
# Any scripts or data that you put into this service are public.
#
# rtika documentation built on May 31, 2023, 8 p.m.