Nothing
## ----setup, include = FALSE---------------------------------------------------
# Later chunks use `eval = NOT_CRAN`, so their code only runs when the
# NOT_CRAN environment variable equals "true" (compared case-insensitively).
NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")

# Chunk defaults for this document: collapse = TRUE and "#>" as the
# comment prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# When evaluation is enabled and rtika::tika_jar() comes back NA,
# call rtika::install_tika() before any chunk runs.
if (NOT_CRAN && is.na(rtika::tika_jar())) {
  rtika::install_tika()
}
## ----eval=NOT_CRAN------------------------------------------------------------
#
# library('rtika')
# library('magrittr')
#
# # Code to get ALL the files in my_path:
#
# # my_path <- "~"
# # batch <- file.path(my_path,
# # list.files(path = my_path,
# # recursive = TRUE))
#
# # pipe the batch into tika_text()
# # to get plain text
#
# # test files
# batch <- c(
# system.file("extdata", "jsonlite.pdf", package = "rtika"),
# system.file("extdata", "curl.pdf", package = "rtika"),
# system.file("extdata", "table.docx", package = "rtika"),
# system.file("extdata", "xml2.pdf", package = "rtika"),
# system.file("extdata", "R-FAQ.html", package = "rtika"),
# system.file("extdata", "calculator.jpg", package = "rtika"),
# system.file("extdata", "tika.apache.org.zip", package = "rtika")
# )
#
# text <-
# batch %>%
# tika_text()
#
# # normal syntax also works:
# # text <- tika_text(batch)
#
## ----eval=NOT_CRAN------------------------------------------------------------
# # Find which files had an issue
# # Handle them if needed
# batch[which(is.na(text))]
## ----eval=NOT_CRAN------------------------------------------------------------
# length(text)
#
# search <-
# text[grep(pattern = ' is ', x = text)]
#
# length(search)
## ----eval=NOT_CRAN------------------------------------------------------------
# download_directory <- tempfile('rtika_')
#
# dir.create(download_directory)
#
# urls <- c('https://tika.apache.org/',
# 'https://cran.rstudio.com/web/packages/keras/keras.pdf')
#
# downloaded <-
# urls %>%
# tika_fetch(download_directory)
#
# # it will add the appropriate file extension to the downloads
# downloaded
#
## ----eval=NOT_CRAN------------------------------------------------------------
# # create a directory not already in use.
# my_directory <-
# tempfile('rtika_')
#
# dir.create(my_directory)
#
# # pipe the batch to tika_text()
# batch %>%
# tika_text(threads = 4,
# return = FALSE,
# output_dir = my_directory)
#
# # list all the file locations
# processed_files <- file.path(
# normalizePath(my_directory),
# list.files(path = my_directory,
# recursive = TRUE)
# )
#
## ----eval=NOT_CRAN------------------------------------------------------------
# processed_files
## ----eval=NOT_CRAN------------------------------------------------------------
# library('xml2')
#
# # get XHTML text
# html <-
# batch %>%
# tika_html() %>%
# lapply(xml2::read_html)
#
# # parse links from documents
# links <-
# html %>%
# lapply(xml2::xml_find_all, '//a') %>%
# lapply(xml2::xml_attr, 'href')
#
# sample(links[[1]],10)
## ----eval=NOT_CRAN------------------------------------------------------------
# # Content-Type
# html %>%
# lapply(xml2::xml_find_first, '//meta[@name="Content-Type"]') %>%
# lapply(xml2::xml_attr, 'content') %>%
# unlist()
#
# # Creation-Date
# html %>%
# lapply(xml2::xml_find_first, '//meta[@name="Creation-Date"]') %>%
# lapply(xml2::xml_attr, 'content') %>%
# unlist()
#
## ----eval=NOT_CRAN------------------------------------------------------------
# library('jsonlite')
# # batch <- system.file("extdata", "calculator.jpg", package = "rtika")
#
# # a list of data.frames
# metadata <-
# batch %>%
# tika_json() %>%
# lapply(jsonlite::fromJSON)
#
# # look at metadata for an image
# str(metadata[[6]])
#
## ----eval=NOT_CRAN------------------------------------------------------------
# metadata[[6]]$'geo:lat'
# metadata[[6]]$'geo:long'
## ----eval=NOT_CRAN------------------------------------------------------------
# # wget gets a webpage and other files.
# # sys::exec_wait('wget', c('--page-requisites', 'https://tika.apache.org/'))
# # Put it all into a .zip file
# # sys::exec_wait('zip', c('-r', 'tika.apache.org.zip' ,'tika.apache.org'))
# batch <- system.file("extdata", "tika.apache.org.zip", package = "rtika")
#
# # a list of data.frames
# metadata <-
# batch %>%
# tika_json() %>%
# lapply(jsonlite::fromJSON)
#
# # The structure is very long. See it on your own with: str(metadata)
#
## ----eval=NOT_CRAN------------------------------------------------------------
# # the 'X-TIKA:embedded_resource_path' field
# embedded_resource_path <-
# metadata %>%
# lapply(function(x){ x$'X-TIKA:embedded_resource_path' })
#
# embedded_resource_path
## ----eval=NOT_CRAN------------------------------------------------------------
# content_type <-
# metadata %>%
# lapply(function(x){ x$'Content-Type' })
#
# content_type
## ----eval=NOT_CRAN------------------------------------------------------------
# content <-
# metadata %>%
# lapply(function(x){ x$'X-TIKA:content' })
#
# str(content)
#
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.