## ----echo = FALSE-------------------------------------------------------------
# Global knitr options for this vignette: merge source and output into a
# single block, and prefix printed output lines with "##"
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "##"
)
## ----eval=TRUE, message = FALSE-----------------------------------------------
# Attach the readtext package; its startup messages are suppressed by the
# chunk option message = FALSE
library("readtext")
## -----------------------------------------------------------------------------
# Locate the example-data directory installed with the readtext package.
# No trailing slash here: every later path is built as
# paste0(DATA_DIR, "/..."), so a trailing slash would produce doubled
# separators like ".../extdata//txt".
DATA_DIR <- system.file("extdata", package = "readtext")
## -----------------------------------------------------------------------------
# Read in all files from a folder; the "*" glob matches every file in it
readtext(paste0(DATA_DIR, "/txt/UDHR/*"))
## -----------------------------------------------------------------------------
# Read the EU manifesto texts, parsing document variables out of each
# filename ("_"-separated fields) and decoding the files from Latin-1
readtext(
  paste0(DATA_DIR, "/txt/EU_manifestos/*.txt"),
  docvarsfrom = "filenames",
  docvarnames = c("unit", "context", "year", "language", "party"),
  dvsep = "_",
  encoding = "ISO-8859-1"
)
## -----------------------------------------------------------------------------
# Read in all files from a folder, recursing through its subdirectories
readtext(paste0(DATA_DIR, "/txt/movie_reviews/*"))
## -----------------------------------------------------------------------------
# Read in comma-separated values; the "texts" column holds the document text
readtext(paste0(DATA_DIR, "/csv/inaugCorpus.csv"), text_field = "texts")
## -----------------------------------------------------------------------------
# Read in tab-separated values; the "speech" column holds the document text
readtext(paste0(DATA_DIR, "/tsv/dailsample.tsv"), text_field = "speech")
## -----------------------------------------------------------------------------
## Read in JSON data; the "texts" field holds the document text
readtext(paste0(DATA_DIR, "/json/inaugural_sample.json"), text_field = "texts")
## -----------------------------------------------------------------------------
## Read in Universal Declaration of Human Rights pdf files, taking the
## document name and language from the "_"-separated filename fields.
## Use dvsep (the readtext docvar-separator argument, as in the
## EU_manifestos chunk above) rather than the unrelated name sep; the
## original only worked because dvsep's default happens to be "_".
(rt_pdf <- readtext(paste0(DATA_DIR, "/pdf/UDHR/*.pdf"),
                    docvarsfrom = "filenames",
                    docvarnames = c("document", "language"),
                    dvsep = "_"))
## -----------------------------------------------------------------------------
## Read in Microsoft Word data (.docx)
readtext(paste0(DATA_DIR, "/word/*.docx"))
## -----------------------------------------------------------------------------
# Note: Example required: which URL should we use?
## -----------------------------------------------------------------------------
# Note: Archive file required. The only zip archive included in readtext has
# different encodings and is difficult to import (see section 4.2).
## -----------------------------------------------------------------------------
# Only run when quanteda can be attached; require() returns FALSE (with a
# warning) instead of erroring when the package is missing, which makes it
# the right tool for this conditional guard
if (require("quanteda")) {
  # import the comma-separated values with readtext
  rt_csv <- readtext(paste0(DATA_DIR, "/csv/inaugCorpus.csv"),
                     text_field = "texts")
  # build a quanteda corpus from the readtext object
  corpus_csv <- corpus(rt_csv)
  # summarise the first 5 documents
  summary(corpus_csv, 5)
}
## ---- message = FALSE---------------------------------------------------------
# Attach the stringi package. library() is preferred to require() for
# unconditional loading: it fails immediately when the package is missing,
# instead of returning FALSE and letting later code fail obscurely.
library("stringi")
## -----------------------------------------------------------------------------
# Make some text with page numbers
sample_text_a <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus,
page 1
with the newspaper from a boy named quick Seamus, in his mouth.
page 2
The quicker brown fox jumped over 2 lazy dogs."
sample_text_a
# Strip the "page <number>" marker lines and reassemble the text:
# first break the string into individual lines
lines_a <- unlist(stri_split_fixed(sample_text_a, '\n'), use.names = FALSE)
# blank out the page markers and trim surrounding whitespace
lines_a <- stri_trim_both(stri_replace_all_regex(lines_a, "page \\d*", ""))
# drop the lines that are now empty, then glue the rest back together
lines_a <- lines_a[lines_a != '']
stri_paste(lines_a, collapse = '\n')
## -----------------------------------------------------------------------------
# Sample text with "- <number> -" style page markers
sample_text_b <- "The quick brown fox named Seamus
- 1 -
jumps over the lazy dog also named Seamus, with
- 2 -
the newspaper from a boy named quick Seamus, in his mouth.
- 33 -
The quicker brown fox jumped over 2 lazy dogs."
sample_text_b
# Split into lines, blank out the "- <digits> -" markers, trim whitespace,
# drop the now-empty lines, and rejoin the remaining text
lines_b <- unlist(stri_split_fixed(sample_text_b, '\n'), use.names = FALSE)
lines_b <- stri_trim_both(stri_replace_all_regex(lines_b, "[-] \\d* [-]", ""))
lines_b <- lines_b[lines_b != '']
stri_paste(lines_b, collapse = '\n')
## -----------------------------------------------------------------------------
# Extract the bundled .zip of differently-encoded texts into a session
# temporary directory
FILEDIR <- tempdir()
zipfile <- system.file("extdata", "data_files_encodedtexts.zip",
                       package = "readtext")
unzip(zipfile, exdir = FILEDIR)
## -----------------------------------------------------------------------------
# Infer each file's encoding from its name: the filenames appear to follow
# "<document>_<language>_<encoding>.txt", so the third "_"-separated field
# is the encoding label (TODO confirm against the archive contents)
filenames <- list.files(FILEDIR, "^(Indian|UDHR_).*\\.txt$")
head(filenames)
# Strip the extension; the dot must be escaped ("\\.txt$"), otherwise "."
# matches any character (e.g. "latin1Xtxt" would also be stripped)
filenames <- gsub("\\.txt$", "", filenames)
parts <- strsplit(filenames, "_")
# vapply() rather than sapply(): guarantees a character vector even when
# filenames is empty
fileencodings <- vapply(parts, "[", character(1), 3)
head(fileencodings)
# Flag declared encodings that iconv() on this platform does not support
notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
fileencodings[notAvailableIndex]
## -----------------------------------------------------------------------------
# Read the zip archive directly, supplying one input encoding per file and
# parsing document variables out of the "_"-separated filenames
txts <- readtext(
  paste0(DATA_DIR, "/data_files_encodedtexts.zip"),
  encoding = fileencodings,
  docvarsfrom = "filenames",
  docvarnames = c("document", "language", "input_encoding")
)
print(txts, n = 50)
## -----------------------------------------------------------------------------
# Build a quanteda corpus from the re-encoded texts, but only when the
# quanteda package can be attached
if (require("quanteda")) {
  corpus_txts <- corpus(txts)
  # summarise the first 5 documents
  summary(corpus_txts, 5)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.