# rtika_introduction.R
# In rtika: R Interface to 'Apache Tika'

## ----setup, include = FALSE---------------------------------------------------
# Flag controlling whether the remaining chunks are evaluated: TRUE only when
# the NOT_CRAN environment variable equals "true" (case-insensitive).
# The later chunk headers pass this flag as their `eval` option.
NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")

# Default knitr chunk options for this document.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# When not on CRAN, call install_tika() if tika_jar() returns NA.
# `&&` short-circuits, so tika_jar() is never queried when NOT_CRAN is FALSE.
if (NOT_CRAN && is.na(rtika::tika_jar())) {
  rtika::install_tika()
}


## ----eval=NOT_CRAN------------------------------------------------------------
# 
# library('rtika')
# library('magrittr')
# 
# # Code to get ALL the files in my_path:
# 
# # my_path <- "~"
# # batch <- file.path(my_path,
# #                 list.files(path = my_path,
# #                 recursive = TRUE))
# 
# # pipe the batch into tika_text()
# # to get plain text
# 
# # test files
# batch <- c(
#   system.file("extdata", "jsonlite.pdf", package = "rtika"),
#   system.file("extdata", "curl.pdf", package = "rtika"),
#   system.file("extdata", "table.docx", package = "rtika"),
#   system.file("extdata", "xml2.pdf", package = "rtika"),
#   system.file("extdata", "R-FAQ.html", package = "rtika"),
#   system.file("extdata", "calculator.jpg", package = "rtika"),
#   system.file("extdata", "tika.apache.org.zip", package = "rtika")
# )
# 
# text <-
#     batch %>%
#     tika_text()
# 
# # normal syntax also works:
# # text <- tika_text(batch)
# 

## ----eval=NOT_CRAN------------------------------------------------------------
# # Find which files had an issue
# # Handle them if needed
# batch[which(is.na(text))]

## ----eval=NOT_CRAN------------------------------------------------------------
# length(text)
# 
# search <-
#     text[grep(pattern = ' is ', x = text)]
# 
# length(search)

## ----eval=NOT_CRAN------------------------------------------------------------
# download_directory <- tempfile('rtika_')
# 
# dir.create(download_directory)
# 
# urls <- c('https://tika.apache.org/',
#           'https://cran.rstudio.com/web/packages/keras/keras.pdf')
# 
# downloaded <-
#     urls %>%
#     tika_fetch(download_directory)
# 
# # it will add the appropriate file extension to the downloads
# downloaded
# 

## ----eval=NOT_CRAN------------------------------------------------------------
# # create a directory not already in use.
# my_directory <-
#    tempfile('rtika_')
# 
# dir.create(my_directory)
# 
# # pipe the batch to tika_text()
# batch %>%
# tika_text(threads = 4,
#           return = FALSE,
#           output_dir = my_directory)
# 
# # list all the file locations
# processed_files <- file.path(
#                 normalizePath(my_directory),
#                 list.files(path = my_directory,
#                 recursive = TRUE)
#                 )
# 

## ----eval=NOT_CRAN------------------------------------------------------------
# processed_files

## ----eval=NOT_CRAN------------------------------------------------------------
# library('xml2')
# 
# # get XHTML text
# html <-
#     batch %>%
#     tika_html() %>%
#     lapply(xml2::read_html)
# 
# # parse links from documents
# links <-
#     html %>%
#     lapply(xml2::xml_find_all, '//a') %>%
#     lapply(xml2::xml_attr, 'href')
# 
# sample(links[[1]],10)

## ----eval=NOT_CRAN------------------------------------------------------------
# # Content-Type
# html %>%
# lapply(xml2::xml_find_first, '//meta[@name="Content-Type"]') %>%
# lapply(xml2::xml_attr, 'content') %>%
# unlist()
# 
# # Creation-Date
# html %>%
# lapply(xml2::xml_find_first, '//meta[@name="Creation-Date"]') %>%
# lapply(xml2::xml_attr, 'content') %>%
# unlist()
# 

## ----eval=NOT_CRAN------------------------------------------------------------
# library('jsonlite')
# # batch <- system.file("extdata", "calculator.jpg", package = "rtika")
# 
# # a list of data.frames
# metadata <-
#     batch %>%
#     tika_json() %>%
#     lapply(jsonlite::fromJSON)
# 
# # look at metadata for an image
# str(metadata[[6]])
# 

## ----eval=NOT_CRAN------------------------------------------------------------
# metadata[[6]]$'geo:lat'
# metadata[[6]]$'geo:long'

## ----eval=NOT_CRAN------------------------------------------------------------
# # wget gets a webpage and other files.
# # sys::exec_wait('wget', c('--page-requisites', 'https://tika.apache.org/'))
# # Put it all into a .zip file
# # sys::exec_wait('zip', c('-r', 'tika.apache.org.zip' ,'tika.apache.org'))
# batch <- system.file("extdata", "tika.apache.org.zip", package = "rtika")
# 
# # a list of data.frames
# metadata <-
#     batch %>%
#     tika_json() %>%
#     lapply(jsonlite::fromJSON)
# 
# # The structure is very long. See it on your own with: str(metadata)
# 

## ----eval=NOT_CRAN------------------------------------------------------------
# # the 'X-TIKA:embedded_resource_path' field
# embedded_resource_path <-
#     metadata %>%
#     lapply(function(x){ x$'X-TIKA:embedded_resource_path' })
# 
# embedded_resource_path

## ----eval=NOT_CRAN------------------------------------------------------------
# content_type <-
#     metadata %>%
#     lapply(function(x){ x$'Content-Type' })
# 
# content_type

## ----eval=NOT_CRAN------------------------------------------------------------
# content <-
#      metadata %>%
#     lapply(function(x){ x$'X-TIKA:content' })
# 
# str(content)
#