# Startup ----------------------------------------------------------------------
#' @importFrom utils packageVersion
.onAttach <- function(...) {
packageStartupMessage(
"LexisNexisTools Version ",
packageVersion("LexisNexisTools")
)
}
# Class and Methods ------------------------------------------------------------
#' An S4 class to store the three data.frames created with \link{lnt_read}
#'
#' This S4 class stores the output from \link{lnt_read}. Just like a spreadsheet
#' with multiple worksheets, an LNToutput object consists of three data.frames
#' which you can select using \code{@}. This object class is intended to be an
#' intermediate container. As it stores articles and paragraphs in two separate
#' data.frames, nested in an S4 object, the relevant text data is stored twice
#' in almost the same format. This has the advantage that there is no need to
#' use special characters, such as "\\n", to indicate a new paragraph. However,
#' it makes the files rather big when you save them directly. They should thus
#' usually be subsetted using \code{@} or converted to a different format using
#' \link{lnt_convert}.
#'
#' @slot meta The metadata of the articles read in.
#' @slot articles The article texts and respective IDs.
#' @slot paragraphs The paragraphs (if the data.frame exists) and respective
#' article and paragraph IDs.
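#'
#' @examples
#' # A minimal sketch of typical usage, based on the bundled sample file:
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#' LNToutput@meta       # access the metadata data.frame
#' LNToutput@articles   # access the article texts
#' LNToutput[1:2]       # subset to the first two articles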
#' @name LNToutput
#' @importFrom methods new
setClass(
"LNToutput",
representation(
meta = "data.frame",
articles = "data.frame",
paragraphs = "data.frame"
)
)
#' Methods for LNToutput output objects
#'
#' @param x,object An LNToutput object.
#' @param i Rows of the meta data.frame (default) or values of j.
#' @param j The column you want to use to subset the LNToutput object. Takes
#' character strings.
#' @param invert Invert the selection of i.
#' @param ... not used.
#' @param drop not used (it's here so tests do not complain).
#' @param e1,e2 LNToutput objects which will be combined.
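#'
#' @examples
#' # A short sketch of the methods, using the bundled sample file:
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#' dim(LNToutput)         # articles, meta variables and number of data.frames
#' LNToutput[1:3]         # subset via rows of the meta data.frame
#' \dontrun{
#' # Subset via column values ("The Guardian" is a hypothetical value here):
#' LNToutput["The Guardian", "Newspaper"]
#' }
#' LNToutput + LNToutput  # combine two objects; duplicated IDs are repaired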
#' @name LNToutput_methods
#' @importFrom tibble tibble
NULL
#' @rdname LNToutput_methods
setMethod("dim",
signature = "LNToutput",
definition = function(x) {
c(
Articles = nrow(x@meta),
Meta_variable = ncol(x@meta),
data.frames = ifelse(
all(is.na(x@paragraphs)),
2,
3
)
)
}
)
#' @rdname LNToutput_methods
setMethod("show",
signature = "LNToutput",
definition = function(object) {
cat("Object of class 'LNToutput':\n")
cat(nrow(object@meta), "articles\n")
cat(nrow(object@paragraphs), "paragraphs\n")
print(object@meta, n = 6)
print(object@articles, n = 6)
print(object@paragraphs, n = 6)
}
)
#' @rdname LNToutput_methods
setMethod("[",
signature = "LNToutput",
definition = function(x, i, j, invert = FALSE, ..., drop = TRUE) {
if (missing(j)) {
x@meta <- x@meta[i, ]
x@articles <- x@articles[i, ]
x@paragraphs <- x@paragraphs[x@paragraphs$Art_ID %in% x@meta$ID, ]
} else {
if (j %in% colnames(x@meta)) {
select <- x@meta$ID[x@meta[[j]] %in% i]
} else if (j %in% colnames(x@articles)) {
select <- x@articles$ID[x@articles[[j]] %in% i]
} else if (j %in% colnames(x@paragraphs)) {
select <- x@paragraphs$Art_ID[x@paragraphs[[j]] %in% i]
select_par <- x@paragraphs$Par_ID[x@paragraphs[[j]] %in% i]
} else {
stop("'j' was not found to be a valid column name.")
}
if (invert) {
select <- x@meta$ID[!x@meta$ID %in% select]
}
x@meta <- x@meta[x@meta$ID %in% select, ]
x@articles <- x@articles[x@articles$ID %in% select, ]
if (exists("select_par")) {
x@paragraphs <- x@paragraphs[x@paragraphs$Par_ID %in% select_par, ]
} else {
x@paragraphs <- x@paragraphs[x@paragraphs$Art_ID %in% select, ]
}
}
return(x)
}
)
#' @rdname LNToutput_methods
setMethod("+",
signature = c("LNToutput", "LNToutput"),
definition = function(e1, e2) {
IDs <- c(e1@meta$ID, (e2@meta$ID + max(e1@meta$ID)))
Par_IDs <- c(
e1@paragraphs$Par_ID,
(e2@paragraphs$Par_ID + max(e1@paragraphs$Par_ID))
)
Art_IDs <- c(
e1@paragraphs$Art_ID,
(e2@paragraphs$Art_ID + max(e1@paragraphs$Art_ID))
)
e1@meta <- rbind(e1@meta, e2@meta)
e1@articles <- rbind(e1@articles, e2@articles)
e1@paragraphs <- rbind(e1@paragraphs, e2@paragraphs)
if (any(duplicated(e1@meta$ID)) ||
any(duplicated(e1@paragraphs$Par_ID))) {
warning("After objects were merged, there were duplicated IDs. This was fixed.")
e1@meta$ID <- IDs
e1@articles$ID <- IDs
e1@paragraphs$Art_ID <- Art_IDs
e1@paragraphs$Par_ID <- Par_IDs
}
return(e1)
}
)
# Main Functions ------------------------------------------------------------
#' Read in a LexisNexis file
#'
#' Read a file from LexisNexis in a supported format and convert it to an object
#' of class \link{LNToutput}. Supported formats are TXT, DOC, RTF and PDF files.
#'
#' @param x Name(s) of file(s) or one or multiple directories containing files
#' from LexisNexis to be converted.
#' @param encoding Encoding to be assumed for input files. Defaults to UTF-8
#' (the LexisNexis standard value).
#' @param extract_paragraphs A logical flag indicating if the returned object
#' will include a third data frame with paragraphs.
#' @param convert_date A logical flag indicating whether the function should
#' try to convert the date of each article into Date format. For non-standard
#' dates provided by LexisNexis it might be safer to convert dates afterwards
#' (see \link{lnt_asDate}).
#' @param start_keyword Is used to indicate the beginning of an article. All
#' articles should have the same number of beginnings, ends and lengths (which
#' indicate the last line of metadata). Use a regular expression such as "\\d+ of
#' \\d+ DOCUMENTS$" (which would catch, e.g., the format "2 of 100 DOCUMENTS")
#' or "auto" to try all common keywords. Keyword search is case sensitive.
#' @param end_keyword Is used to indicate the end of an article. Works the same
#' way as start_keyword. A common regex would be "^LANGUAGE: " which catches
#' language in all caps at the beginning of the line (usually the last line of
#' an article).
#' @param length_keyword Is used to indicate the end of the metadata. Works the
#' same way as start_keyword and end_keyword. A common regex would be
#' "^LENGTH: " which catches length in all caps at the beginning of the line
#' (usually the last line of the metadata).
#' @param author_keyword A keyword to identify the author(s) in the metadata.
#' @param exclude_lines Lines in which these keywords are found are excluded.
#' Set to \code{character()} if you want to turn off this feature.
#' @param recursive A logical flag indicating whether subdirectories are
#' searched for more files.
#' @param file_type File types/extensions to be included in search for files.
#' @param remove_cover Logical. Should the cover page be removed?
#' @param remove_classification Logical. Should the classification provided by
#' LexisNexis be removed?
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param ... Additional arguments passed on to \link{lnt_asDate}.
#' @return An LNToutput S4 object consisting of 3 data.frames for metadata,
#' articles and paragraphs.
#' @details The function can produce an \link{LNToutput} S4 object with two or
#' three data.frames: meta, containing all meta information such as date,
#' author and headline; and articles, containing just the article ID and the
#' text of the articles. When extract_paragraphs is set to TRUE, the output
#' contains a third data.frame, similar to articles but with articles split
#' into paragraphs.
#'
#' When left to 'auto', the keywords will use the following defaults, which
#' should be the standard keywords in all languages used by 'LexisNexis':
#'
#' * \code{start_keyword = "\\d+ of \\d+ DOCUMENTS$| Dokument \\d+ von \\d+$|
#' Document \\d+ de \\d+$"}.
#'
#' * \code{end_keyword = "^LANGUAGE: |^SPRACHE: |^LANGUE: "}.
#'
#' @author Johannes B. Gruber
#' @export
#' @examples
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#' meta.df <- LNToutput@meta
#' articles.df <- LNToutput@articles
#' paragraphs.df <- LNToutput@paragraphs
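#' \dontrun{
#' # A sketch with explicit keywords instead of "auto"; the regular
#' # expressions are the documented English defaults from the Details section:
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE),
#'   start_keyword = "\\d+ of \\d+ DOCUMENTS$",
#'   end_keyword = "^LANGUAGE: "
#' )
#' }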
#' @import stringi
#' @import data.table
#' @importFrom utils tail
#' @importFrom tibble tibble as_tibble
lnt_read <- function(x,
encoding = "UTF-8",
extract_paragraphs = TRUE,
convert_date = TRUE,
start_keyword = "auto",
end_keyword = "auto",
length_keyword = "auto",
author_keyword = "auto",
exclude_lines = "^LOAD-DATE: |^UPDATE: |^GRAFIK: |^GRAPHIC: |^DATELINE: ",
recursive = FALSE,
file_type = c("txt", "rtf", "doc", "pdf", "docx", "zip"),
remove_cover = TRUE,
remove_classification = TRUE,
verbose = TRUE,
...) {
if ("file_pattern" %in% names(list(...))) {
warning("The argument 'file_pattern' was used in earlier versions of the ",
"package and has been replaced by 'file_type'. Please consider ",
"changing your syntax.")
}
files <- get_files(x, recursive = recursive, types = file_type)
# Track the time (start_time is also needed by the parsers' status messages)
start_time <- Sys.time()
if (verbose) {
message(
"Creating LNToutput from ", length(files),
ifelse(length(files) > 1,
" files...",
" file..."
)
)
}
lines <- lnt_read_lines(files, encoding)
if (length(lines$nexis) > 0) {
out_nexis <- lnt_parse_nexis(
lines = lines$nexis,
extract_paragraphs = extract_paragraphs,
convert_date = convert_date,
start_keyword = start_keyword,
end_keyword = end_keyword,
length_keyword = length_keyword,
author_keyword = author_keyword,
exclude_lines = exclude_lines,
verbose = verbose,
start_time = start_time,
...
)
}
if (length(lines$uni) > 0) {
out_uni <- lnt_parse_uni(
lines = lines$uni,
extract_paragraphs = extract_paragraphs,
convert_date = convert_date,
start_keyword = start_keyword,
end_keyword = end_keyword,
length_keyword = length_keyword,
author_keyword = author_keyword,
exclude_lines = exclude_lines,
verbose = verbose,
start_time = start_time,
remove_cover = remove_cover,
remove_classification = remove_classification,
...
)
}
if (exists("out_nexis") & exists("out_uni")) {
out <- out_nexis + out_uni
} else if (exists("out_nexis")) {
out <- out_nexis
} else if (exists("out_uni")) {
out <- out_uni
}
attributes(out)$created <- list(
time = Sys.time(),
Version = packageVersion("LexisNexisTools")
)
return(out)
}
#' lnt_parse_nexis
#'
#' Internal function to parse lines from nexis.com files.
#'
#' @param lines Input lines from \link{lnt_read_lines}.
#' @param start_time Time the task was started (for status messages).
#' @inheritParams lnt_read
#'
#' @noRd
lnt_parse_nexis <- function(lines,
extract_paragraphs,
convert_date,
start_keyword,
end_keyword,
length_keyword,
author_keyword,
exclude_lines,
verbose,
start_time,
...) {
if (start_keyword == "auto") {
start_keyword <- "\\d+ of \\d+ DOCUMENTS$|\\d+ of \\d+ DOCUMENT$|Dokument \\d+ von \\d+$| Document \\d+ de \\d+$"
}
if (end_keyword == "auto") {
end_keyword <- "^LANGUAGE: |^SPRACHE: |^LANGUE: "
}
if (length_keyword == "auto") {
length_keyword <- "^LENGTH: |^L\u00c4NGE: |^LONGUEUR: "
}
if (author_keyword == "auto") {
author_keyword <- "AUTOR: |VON |BYLINE: "
}
status("\t...files loaded", verbose, start_time)
# exclude some lines
if (length(exclude_lines) > 0) {
lines[grep(exclude_lines, lines)] <- ""
}
articles.l <- split(
lines, cumsum(stringi::stri_detect_regex(lines, start_keyword))
)
articles.l[["0"]] <- NULL
names(articles.l) <- NULL
rm(lines)
if (length(articles.l) == 0) {
stop("No articles found in provided file(s)")
}
df.l <- lapply(articles.l, function(a) {
len <- grep(length_keyword, a)[1]
if (!is.na(len)) {
list(
source = names(a)[1],
meta = unname(a[2:len]),
article = unname(a[(len + 1):(length(a) - 1)]),
graphic = FALSE
)
} else {
list(
source = names(a)[1],
meta = NULL,
article = a,
graphic = TRUE
)
}
})
status("\t...articles split", verbose, start_time)
# make data.frame
### length
. <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(pattern = length_keyword, x = i$meta, value = TRUE)[1]
})
length.v <- stri_replace_all_regex(., length_keyword, "")
status("\t...lengths extracted", verbose, start_time)
### Newspaper. First non-empty line
newspaper.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(
pattern = "^$",
x = i$meta,
value = TRUE,
fixed = FALSE,
invert = TRUE
)[1]
})
# remove if newspaper.v contains Date or Beginning
newspaper.v[grep(
"January|February|March|April|May|June|July|August|September|October|November|December",
newspaper.v
)] <- ""
status("\t...newspapers extracted", verbose, start_time)
### Date
date.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
. <- stringi::stri_extract_last_regex(
str = i$meta[seq_len(10)],
pattern = "\\w+ \\d+, \\d+|\\d+ \\w+ \\d+|\\d+. \\w+ \\d+"
)
na.omit(.)[1]
})
status("\t...dates extracted", verbose, start_time)
### Author (where available)
author.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
a <- head(
which(stri_detect_regex(i$meta, pattern = author_keyword)),
n = 1
)
if (length(a) > 0) {
if (!i$meta[a + 1] == "") {
a <- c(a:(a + 1))
}
stringi::stri_join(i$meta[a], collapse = " ")
} else {
""
}
})
author.v[author.v == ""] <- NA
status("\t...authors extracted", verbose, start_time)
### section (where available)
section.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(pattern = "SECTION: |RUBRIK: ", x = i$meta, value = TRUE)[1]
})
status("\t...sections extracted", verbose, start_time)
### edition (where available)
edition.v <- lapply(seq_along(df.l), function(i) {
date <- grep(date.v[i], x = df.l[[i]]$meta, fixed = TRUE)
if (length(date) == 1) {
d1 <- df.l[[i]]$meta[(date + 1):(date + 2)]
if (!d1[1] == "") {
edition.v <- d1[1]
if (isTRUE(!d1[2] == "")) {
edition.v <- c(edition.v, d1[2])
}
edition.v
} else {
# Alternatively, the edition is sometimes the first non-empty line in the article
edition.v <- grep("edition",
df.l[[i]]$article[!stringi::stri_isempty(str = df.l[[i]]$article)][1],
value = TRUE,
ignore.case = TRUE
)
ifelse(length(edition.v) == 0,
NA,
edition.v
)
}
} else {
NA
}
})
status("\t...editions extracted", verbose, start_time)
### Headline
headline.v <- vapply(seq_along(df.l), FUN.VALUE = character(1), function(i) {
if (!df.l[[i]]$graphic) {
headline <- df.l[[i]]$meta
pattern <- na.omit(c(
length.v[i],
date.v[i],
newspaper.v[i],
author.v[i],
section.v[i],
edition.v[[i]]
))
pattern <- pattern[pattern != ""]
remove.m <- vapply(pattern, FUN.VALUE = matrix(nrow = length(headline)), function(p) {
out <- stringi::stri_detect_fixed(headline, p[1])
if (length(p) > 1) {
out + stringi::stri_detect_fixed(headline, p[2])
} else {
out
}
})
headline[as.logical(rowSums(remove.m, na.rm = TRUE))] <- ""
headline <- stringi::stri_join(headline, collapse = " ")
stri_replace_all_regex(headline, "\\s+", " ")
} else {
""
}
})
status("\t...headlines extracted", verbose, start_time)
if (convert_date) {
date.v <- lnt_asDate(date.v, ...)
status("\t...dates converted", verbose, start_time)
}
# Clean the clutter from objects
author.v <- stri_replace_all_regex(
str = author.v,
pattern = author_keyword,
replacement = ""
)
section.v <- stri_replace_all_regex(
str = section.v,
pattern = "SECTION: |RUBRIK: ",
replacement = ""
)
edition.v <- vapply(edition.v, FUN.VALUE = character(1), function(e) {
out <- stri_paste(e, collapse = " ")
out <- stri_replace_all_regex(out, "\\s+", " ")
out[out == ""] <- NA
stri_trim_both(out)
})
### make data.frame
meta.df <- tibble(
ID = seq_along(df.l),
Source_File = unlist(lapply(df.l, function(i) i[["source"]])),
Newspaper = trimws(newspaper.v, which = "both"),
Date = date.v,
Length = trimws(length.v, which = "both"),
Section = trimws(section.v, which = "both"),
Author = trimws(author.v, which = "both"),
Edition = edition.v,
Headline = trimws(headline.v, which = "both"),
Graphic = unlist(lapply(df.l, function(i) i[["graphic"]]))
)
status("\t...metadata extracted", verbose, start_time)
# Cut off article after the end keyword
df.l <- lapply(df.l, function(i) {
end <- tail(grep(end_keyword, i$article), n = 1)
if (length(end) > 0) {
i$article <- i$article[seq_len(end - 1)]
}
i$article
})
articles.df <- data.frame(
ID = seq_along(df.l),
Article = sapply(df.l, function(i) {
stringi::stri_join(i, collapse = "\n")
}),
stringsAsFactors = FALSE
)
# solves weird covr behaviour
articles.df <- tibble::as_tibble(articles.df)
status("\t...article texts extracted", verbose, start_time)
if (extract_paragraphs) {
# split paragraphs
. <- stringi::stri_split_fixed(
str = articles.df$Article,
pattern = "\n\n",
n = -1L,
omit_empty = TRUE,
simplify = FALSE
)
paragraphs.df <- data.table::rbindlist(lapply(seq_along(.), function(i) {
if (length(.[[i]][!.[[i]] == "\n"]) > 0) {
tibble(
Art_ID = i,
Paragraph = .[[i]][!.[[i]] == "\n"]
)
} else {
tibble(
Art_ID = i,
Paragraph = NA
)
}
}))
paragraphs.df$Par_ID <- seq_len(nrow(paragraphs.df))
paragraphs.df <- paragraphs.df[, c("Art_ID", "Par_ID", "Paragraph")]
if (verbose) {
message("\t...paragraphs extracted [", format(
(Sys.time() - start_time),
digits = 2, nsmall = 2
), "]")
}
} else {
paragraphs.df <- tibble(
Art_ID = NA,
Par_ID = NA,
Paragraph = NA
)
}
# remove unnecessary whitespace (removes \n as well)
articles.df$Article <- stringi::stri_replace_all_regex(
str = articles.df$Article,
pattern = c("\\s+", "^\\s|\\s$"),
replacement = c(" ", ""),
vectorize_all = FALSE
)
paragraphs.df$Paragraph <- stringi::stri_replace_all_regex(
str = paragraphs.df$Paragraph,
pattern = c("\\s+", "^\\s|\\s$"),
replacement = c(" ", ""),
vectorize_all = FALSE
)
if (verbose) {
status("\t...superfluous whitespace removed", verbose, start_time)
message("Elapsed time: ", format(
(Sys.time() - start_time),
digits = 2, nsmall = 2
))
}
out <- new(
"LNToutput",
meta = meta.df,
articles = articles.df,
paragraphs = tibble::as_tibble(paragraphs.df)
)
return(out)
}
#' lnt_parse_uni
#'
#' Internal function to parse lines from Nexis Uni files.
#'
#' @param lines Input lines from \link{lnt_read_lines}.
#' @param start_time Time the task was started (for status messages).
#' @inheritParams lnt_read
#'
#' @importFrom stats setNames
#'
#' @noRd
lnt_parse_uni <- function(lines,
extract_paragraphs,
convert_date,
start_keyword,
end_keyword,
length_keyword,
author_keyword,
exclude_lines,
verbose,
start_time,
remove_cover,
remove_classification,
...) {
if (end_keyword == "auto") {
end_keyword <- "^End of Document$"
if (!any(stringi::stri_detect_regex(lines, end_keyword))) {
end_keyword <- "End of Document$"
}
}
if (length_keyword == "auto") {
length_keyword <- "^Length:\u00a0|^L\u00c4NGE:|^LONGUEUR:"
}
if (author_keyword == "auto") {
author_keyword <- "^Byline:\u00a0|^Author:\u00a0"
}
status("\t...files loaded", verbose, start_time)
# exclude some lines
if (length(exclude_lines) > 0) {
lines[grep(exclude_lines, lines)] <- ""
}
# remove cover page(s) (separated from the body by a run of more than two empty lines)
if (remove_cover) {
lines <- lapply(unname(split(lines, names(lines))), function(l) {
l <- rle(l)
if (any(l$lengths > 2 & l$values == "")) {
l$article <- cumsum(l$lengths > 2 & l$values == "")
l <- l$values[l$article > min(l$article)] # remove everything before the first blank run
} else {
l <- l$values
}
l <- l[!l == ""]
return(l)
})
lines <- unlist(lines)
}
articles.l <- split(
lines, cumsum(stringi::stri_detect_regex(lines, end_keyword))
)
if (!length(articles.l)) {
stop("No articles found to parse.")
}
# last "article" only contains End of Document
articles.l[[length(articles.l)]] <- NULL
names(articles.l) <- NULL
if (!length(articles.l)) {
stop("No articles found to parse.")
}
# the first article is not preceded by an end keyword line; pad it so its structure matches the later articles
if (!stringi::stri_detect_regex(articles.l[[1]][1], end_keyword)) {
articles.l[[1]] <- c(articles.l[[1]][1], articles.l[[1]])
}
# split meta from body
df.l <- lapply(articles.l, function(a) {
split <- which(stringi::stri_detect_regex(a, "^Body$|^Text$"))[1]
if (!is.na(split)) {
list(
source = names(a)[2],
meta = unname(a[2:split]),
article = unname(a[(split + 1):(length(a) - 1)]),
graphic = FALSE
)
} else {
list(
source = names(a)[2],
meta = NULL,
article = a,
graphic = TRUE
)
}
})
status("\t...articles split", verbose, start_time)
# remove classification
if (remove_classification) {
df.l <- lapply(df.l, function(a) {
cls_pos <- tail(grep("^Classification$", a$article), 1L)
if (length(cls_pos) > 0) {
a$article <- a$article[seq_len(cls_pos - 1)]
}
return(a)
})
}
# make data.frame
### length
len <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(pattern = length_keyword, x = i$meta,
value = TRUE, ignore.case = TRUE)[1]
})
length.v <- trimws(stringi::stri_replace_all_regex(len, length_keyword, ""))
status("\t...lengths extracted", verbose, start_time)
### Headline. First non-empty line
headline.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(
pattern = "^$",
x = i$meta,
value = TRUE,
fixed = FALSE,
invert = TRUE
)[1]
})
status("\t...headlines extracted", verbose, start_time)
### Newspaper. Second non-empty line
newspaper.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(
pattern = "^$",
x = i$meta,
value = TRUE,
fixed = FALSE,
invert = TRUE
)[2]
})
status("\t...newspapers extracted", verbose, start_time)
### Date
date.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
. <- stringi::stri_extract_last_regex(
str = i$meta[seq_len(10)],
pattern = "^\\w+ \\d+, \\d+|^\\d+ \\w+ \\d+|^\\d+. \\w+ \\d+|^\\w+ \\d+. \\w+ \\d+"
)
na.omit(.)[1]
})
status("\t...dates extracted", verbose, start_time)
### Author (where available)
author.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(pattern = author_keyword, x = i$meta, value = TRUE)[1]
})
author.v <- stringi::stri_replace_all_regex(author.v, author_keyword, "")
author.v[author.v == ""] <- NA
status("\t...authors extracted", verbose, start_time)
### section (where available)
section_keyword <- "^Section:\u00a0"
section.v <- vapply(df.l, FUN.VALUE = character(1), function(i) {
grep(pattern = section_keyword, x = i$meta, value = TRUE)[1]
})
section.v <- stringi::stri_replace_all_regex(section.v, section_keyword, "")
status("\t...sections extracted", verbose, start_time)
### edition (not yet implemented)
edition.v <- NA
status("\t...editions extracted", verbose, start_time)
if (convert_date) {
date.v <- lnt_asDate(date.v, ...)
status("\t...dates converted", verbose, start_time)
}
### make data.frame
meta.df <- tibble(
ID = seq_along(df.l),
Source_File = unlist(lapply(df.l, function(i) i[["source"]])),
Newspaper = trimws(newspaper.v, which = "both"),
Date = date.v,
Length = trimws(length.v, which = "both"),
Section = trimws(section.v, which = "both"),
Author = trimws(author.v, which = "both"),
Edition = edition.v,
Headline = trimws(headline.v, which = "both"),
Graphic = unlist(lapply(df.l, function(i) i[["graphic"]]))
)
status("\t...metadata extracted", verbose, start_time)
par <- unlist(lapply(df.l, "[[", "article"), use.names = FALSE)
paragraphs.df <- data.table::data.table(
Art_ID = rep(seq_along(df.l), vapply(df.l, function(i) length(i$article), integer(1))),
Par_ID = seq_along(par),
Paragraph = par
)
status("\t...article texts extracted", verbose, start_time)
paragraphs.df$Paragraph <- stringi::stri_replace_all_regex(
str = paragraphs.df$Paragraph,
pattern = c("\\s+|^\\s|\\s$"),
replacement = c(" "),
vectorize_all = FALSE
)
if (verbose) {
status("\t...superfluous whitespace removed", verbose, start_time)
message("Elapsed time: ", format(
(Sys.time() - start_time),
digits = 2, nsmall = 2
))
}
Paragraph <- NULL
Art_ID <- NULL
articles.df <- paragraphs.df[,
list(Article = stri_join(Paragraph, collapse = "\n")),
by = list(ID = Art_ID)]
articles.df <- tibble::as_tibble(articles.df)
paragraphs.df <- tibble::as_tibble(paragraphs.df)
attr(articles.df, ".internal.selfref") <- NULL
attr(paragraphs.df, ".internal.selfref") <- NULL
out <- new(
"LNToutput",
meta = meta.df,
articles = articles.df,
paragraphs = paragraphs.df
)
return(out)
}
#' Assign proper names to LexisNexis files
#'
#' Give proper names to files downloaded from 'LexisNexis' based on search
#' term and period retrieved from each file cover page. This information is not
#' always delivered by LexisNexis though. If the information is not present in
#' the file, new file names will be empty.
#'
#' Warning: This will rename all supported files in a given folder.
#'
#' @param x Can be either a character vector of LexisNexis file name(s),
#' folder name(s) or can be left blank (see example).
#' @param encoding Encoding to be assumed for input files. Defaults to UTF-8
#' (the LexisNexis standard value).
#' @param recursive A logical flag indicating whether subdirectories are
#' searched for more files.
#' @param report A logical flag indicating whether the function will return a
#' report of which files were renamed.
#' @param simulate Should the renaming be simulated instead of actually done?
#' This can help prevent accidental renaming of unrelated files which
#' happen to be in the same directory as the files from 'LexisNexis'.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @keywords LexisNexis
#' @author Johannes B. Gruber
#' @export
#' @importFrom stats na.omit
#' @importFrom stringi stri_extract_all_regex stri_join
#' @examples
#' \dontrun{
#' # Copy sample file to current wd
#' lnt_sample()
#'
#' # Rename files in current wd and report back if successful
#'
#' report.df <- lnt_rename(
#' recursive = FALSE,
#' report = TRUE
#' )
#'
#'
#' # Or provide file name(s)
#' my_files <- list.files(
#' pattern = ".txt", full.names = TRUE,
#' recursive = TRUE, ignore.case = TRUE
#' )
#' report.df <- lnt_rename(
#' x = my_files,
#' recursive = FALSE,
#' report = TRUE
#' )
#'
#' # Or provide folder name(s)
#' report.df <- lnt_rename(x = getwd())
#'
#' report.df
#' }
lnt_rename <- function(x,
encoding = "UTF-8",
recursive = FALSE,
report = TRUE,
simulate = TRUE,
verbose = FALSE) {
files <- get_files(x)
# Track the time
start_time <- Sys.time()
if (verbose) message("Checking LN files...")
files <- unique(files)
if (verbose) message(length(files), " files found to process...")
renamed <- data.frame(
name_orig = files,
name_new = character(length = length(files)),
status = character(length = length(files)),
type = tolower(tools::file_ext(files)),
stringsAsFactors = FALSE
)
# start renaming files
tbl <- renamed[renamed$type == "txt", ]
if (nrow(tbl) > 0) {
renamed[renamed$type == "txt", ] <- lnt_rename_txt(
tbl, encoding, simulate, verbose
)
}
tbl <- renamed[renamed$type == "docx", ]
if (nrow(tbl) > 0) {
renamed[renamed$type == "docx", ] <- lnt_rename_docx(
tbl, encoding, simulate, verbose
)
}
other <- !renamed$type %in% c("txt", "docx")
renamed$name_new[other] <- renamed$name_orig[other]
renamed$status[other] <- paste0("not renamed (not implemented for ",
renamed$type[other],
")")
if (verbose) {
message(sum(grepl("^renamed$", renamed$status)),
" files renamed, ",
appendLF = FALSE
)
if (sum(grepl("exists", renamed$status, fixed = TRUE)) > 0) {
message(sum(grepl("exists", renamed$status, fixed = TRUE)),
" not renamed (file already exists), ",
appendLF = FALSE
)
}
if (sum(grepl("empty", renamed$status, fixed = TRUE)) > 0) {
message(sum(grepl("empty", renamed$status, fixed = TRUE)),
" not renamed (no search term or time range found), ",
appendLF = FALSE
)
}
}
renamed$status <- as.factor(renamed$status)
elapsed <- Sys.time() - start_time
if (verbose) {
message(
"in ", format(elapsed, digits = 2, nsmall = 2), appendLF = FALSE
)
}
if (simulate) message(" [changes were only simulated]")
if (report) return(tibble::as_tibble(renamed))
}
lnt_rename_txt <- function(tbl, encoding, simulate, verbose) {
files <- tbl$name_orig
for (i in seq_along(files)) {
# read in the articles
content_v <- readLines(files[i], encoding = encoding, n = 50)
# look for the range of articles
range.v <- content_v[grep("^Download Request:|^Ausgabeauftrag: Dokument", content_v)]
# extract the actual range information from the line
range.v <- stringi::stri_extract_all_regex(range.v, pattern = "[[:digit:]]|-", simplify = TRUE)
range.v <- stringi::stri_join(range.v, sep = "", collapse = "")
# look for search term
term.v <- content_v[grep("^Terms: |^Begriffe: ", content_v)]
# erase everything in the line except the actual search terms
term.v <- gsub("^Terms: |^Begriffe: ", "", term.v)
# split terms into elements separated by AND or OR
term.v <- unlist(strsplit(term.v, split = " AND | and | OR ", fixed = FALSE))
date.v <- term.v[grepl("\\d+-\\d+-\\d+", term.v)]
if (length(date.v) > 1) {
date.v <- paste0(
gsub(
"[^[:digit:]]",
"",
term.v[1]
),
"-",
gsub(
"[^[:digit:]]",
"",
term.v[2]
)
)
term.v <- gsub("[^[:alpha:]]", "", term.v[3])
} else if (length(date.v) > 0) {
date.v <- gsub(
"[^[:digit:]]",
"",
term.v
)[1]
term.v <- gsub("[^[:alpha:]]", "", term.v[2])
} else {
date.v <- "NA"
term.v <- gsub("[^[:alpha:]]", "", term.v)
}
file_name <- sub("[^/]+$", "", files[i]) # take old filepath
file_name <- paste0(file_name, term.v, "_", date.v, "_", range.v, ".txt")
# rename file
if (file.exists(file_name)) {
tbl$name_new[i] <- tbl$name_orig[i]
tbl$status[i] <- "not renamed (file exists)"
} else {
if (file_name == "__.txt") {
tbl$name_new[i] <- file_name
tbl$status[i] <- "not renamed (file is empty)"
} else {
tbl$name_new[i] <- file_name
tbl$status[i] <- "renamed"
if (!simulate) {
file.rename(files[i], file_name)
}
}
}
if (verbose) {
message("\r\t...renaming files ", format(
(100 * (i / length(files))),
digits = 2, nsmall = 2
), "%")
}
}
return(tbl)
}
lnt_rename_docx <- function(tbl, encoding, simulate, verbose) {
check_install("xml2")
for (i in seq_along(tbl$name_orig)) {
# read in file
con <- unz(description = tbl$name_orig[i], filename = "word/document.xml")
content_v <- xml2::read_xml(con, encoding = "utf-8")
rm(con)
content_v <- xml2::xml_find_all(content_v, "//w:p")
content_v <- xml2::xml_text(content_v)
job <- grep("^Job Number:", content_v, value = TRUE)
# extract the actual range information from the line
if (length(job) > 0) {
job <- stringi::stri_extract_all_regex(
job, pattern = "[[:digit:]]+"
)[[1]]
}
# look for search term
terms <- grep("^Search Terms:", content_v, value = TRUE)[1]
# erase everything in the line except the actual search terms
terms <- stringi::stri_replace_all_regex(
terms, "^Search Terms: \\({0,2}", ""
)
# trim long search strings
terms <- stringi::stri_extract_all_words(terms)[[1]][1:3]
terms <- paste(na.omit(terms), collapse = "_")
date <- which(stringi::stri_detect_regex(content_v, "^Narrowed by"))[2]
if (!is.na(date)) {
date <- content_v[date:(date + 3)]
date <- na.omit(unlist(
stringi::stri_extract_all_regex(date, "\\w{3} \\d{2}, \\d{4}")
))
}
date <- paste(na.omit(date), collapse = "-")
file_name <- paste0(dirname(tbl$name_orig[i]),
"/",
paste(terms,
date,
job,
sep = "_"),
".docx")
if (file.exists(file_name)) {
tbl$name_new[i] <- tbl$name_orig[i]
tbl$status[i] <- "not renamed (file exists)"
} else if (grepl("_{2,}.docx", basename(file_name))) {
tbl$name_new[i] <- tbl$name_orig[i]
tbl$status[i] <- "not renamed (file has no cover page)"
} else {
tbl$name_new[i] <- file_name
tbl$status[i] <- "renamed"
if (!simulate) {
file.rename(tbl$name_orig[i], file_name)
}
}
if (verbose) {
message("\r\t...renaming files ", format(
(100 * (i / nrow(tbl))),
digits = 2, nsmall = 2
), "%")
}
}
return(tbl)
}
#' Check for highly similar articles.
#'
#' Check for highly similar articles by comparing all articles published on the
#' same date. This function implements two measures to test if articles are
#' almost identical: the function \link[quanteda.textstats]{textstat_simil}, which
#' compares the word similarity of two given texts, and a relative modification
#' of the generalized Levenshtein (edit) distance implementation in
#' \link[stringdist]{stringdist}. The relative distance is calculated by
#' dividing the string distance by the number of characters in the longer
#' article (resulting in a minimum of 0 if articles are exactly alike and 1 if
#' strings are completely different). Using both methods cancels out the
#' disadvantages of each method: the similarity measure is fast but does not
#' take word order into account. Two widely different texts could,
#' therefore, be identified as the same if they employ the exact same
#' vocabulary for some reason. The generalized Levenshtein distance is more
#' accurate but very computationally demanding, especially if more than two
#' texts are compared at once.
#'
#' @param texts Provide texts to check for similarity.
#' @param dates Provide corresponding dates, same length as \code{texts}.
#' @param LNToutput As an alternative to providing texts and dates individually,
#' you can provide an LNToutput object.
#' @param IDs IDs of articles.
#' @param threshold The threshold of similarity above which an article is
#' considered a duplicate. Note that lower threshold values will increase the
#' time needed to calculate the relative difference (as more articles are considered).
#' @param rel_dist Calculate the relative Levenshtein distance between two
#' articles if set to TRUE (can take very long). The main difference between
#' the similarity and distance value is that the distance takes word order
#' into account while similarity employs the bag of words approach.
#' @param length_diff Before calculating the relative distance between articles,
#' the length of the articles in characters is calculated. If the difference
#' surpasses this value, calculation is omitted and the distance will be set
#' to NA.
#' @param nthread Maximum number of threads to use (see
#' \link[stringdist]{stringdist-parallelization}).
#' @param max_length If the article is too long, calculation of the relative
#' distance can cause R to crash (see
#' \url{https://github.com/markvanderloo/stringdist/issues/59}). To prevent
#' this you can set a maximum length (longer articles will not be evaluated).
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @keywords similarity
#' @return A data.table consisting of information about duplicated
#' articles. Articles with a lower similarity than the threshold will be
#' removed, while all relative distances are still in the returned object.
#' Before you use the duplicated information to subset your dataset, you
#' should, therefore, filter out results with a high relative distance (e.g.
#' larger than 0.2).
#' @author Johannes B. Gruber
#' @export
#' @importFrom stringdist stringdist
#' @importFrom quanteda dfm docnames tokens tokens_remove
#' @importFrom quanteda.textstats textstat_simil
#' @importFrom utils combn
#' @examples
#' \dontrun{
#' # Copy sample file to current wd
#' lnt_sample()
#' }
#'
#' # Convert raw file to LNToutput object
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#'
#' # Test similarity of articles
#' duplicates.df <- lnt_similarity(
#' texts = LNToutput@articles$Article,
#' dates = LNToutput@meta$Date,
#' IDs = LNToutput@articles$ID
#' )
#'
#' # Remove instances with a high relative distance
#' duplicates.df <- duplicates.df[duplicates.df$rel_dist < 0.2]
#'
#' # Create three separate data.frames from cleaned LNToutput object
#' LNToutput <- LNToutput[!LNToutput@meta$ID %in%
#' duplicates.df$ID_duplicate]
#' meta.df <- LNToutput@meta
#' articles.df <- LNToutput@articles
#' paragraphs.df <- LNToutput@paragraphs
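#'
#' # An illustration of the relative distance measure on two toy strings:
#' # the generalized Levenshtein distance divided by the longer string's length
#' stringdist::stringdist("kitten", "sitting", method = "lv") / nchar("sitting")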
lnt_similarity <- function(texts,
dates,
LNToutput,
IDs = NULL,
threshold = 0.99,
rel_dist = TRUE,
length_diff = Inf,
nthread = getOption("sd_num_thread"),
max_length = Inf,
verbose = TRUE) {
call <- match.call(expand.dots = TRUE)
start_time <- Sys.time()
if (missing(LNToutput)) {
if (any(missing(texts), missing(dates))) {
stop("Supply either 'LNToutput' or 'texts' and 'dates'.")
}
if (is.null(IDs)) IDs <- seq_along(texts)
} else {
if (missing(texts)) texts <- LNToutput@articles$Article
if (missing(dates)) dates <- LNToutput@meta$Date
if (is.null(IDs)) IDs <- LNToutput@articles$ID
}
if (length(texts) != length(dates) || length(dates) != length(IDs)) {
stop("'texts', 'dates' and 'IDs' need to have the same length.")
}
dates.d <- unique(dates)
dates.d <- dates.d[order(dates.d)]
if (any(is.na(dates.d))) {
warning("You supplied NA values to 'dates'. Those will be ignored.")
dates.d <- dates.d[!is.na(dates.d)]
}
lengths.v <- vapply(texts, nchar, FUN.VALUE = 1, USE.NAMES = FALSE)
if (any(lengths.v < 1)) {
warning(
"\nAt least one of the supplied texts had length 0. The articles with the following IDs will be ignored: ",
paste(IDs[lengths.v == 0], collapse = ", ")
)
texts <- texts[lengths.v > 0]
dates <- dates[lengths.v > 0]
IDs <- IDs[lengths.v > 0]
}
if (exists("LNToutput")) rm(LNToutput)
if (verbose) {
message(
"Checking similiarity for ", length(dates),
" articles over ", length(dates.d), " dates..."
)
}
text_toks <- tokens_remove(tokens(texts), "[^[:alnum:]]", valuetype = "regex")
text_dfm <- dfm(text_toks, tolower = TRUE, verbose = FALSE)
if (verbose) {
message("\t...quanteda dfm constructed for similarity comparison [",
format(
(Sys.time() - start_time),
digits = 2, nsmall = 2
), "].",
appendLF = TRUE
)
}
quanteda::docnames(text_dfm) <- as.character(IDs)
duplicates.df <- lapply(dates.d, function(x) {
if (sum(x == na.omit(dates)) > 1) {
text_dfm_day <- quanteda::dfm_subset(text_dfm, subset = (dates == x))
sim <- stats::as.dist(textstat_simil(
text_dfm_day,
selection = NULL,
method = "cosine",
margin = "documents"
))
. <- t(combn(as.numeric(quanteda::docnames(text_dfm_day)), 2))
colnames(.) <- c("ID_original", "ID_duplicate")
duplicates.df <- data.frame(
.,
Similarity = as.numeric(sim),
stringsAsFactors = FALSE
)
duplicates.df <- duplicates.df[duplicates.df$Similarity > threshold, ]
if (nrow(duplicates.df) > 0) {
duplicates.df$text_original <- texts[match(duplicates.df$ID_original, IDs)]
duplicates.df$text_duplicate <- texts[match(duplicates.df$ID_duplicate, IDs)]
duplicates.df$Date <- dates[match(duplicates.df$ID_duplicate, IDs)]
duplicates.df <- duplicates.df[, c(
"Date",
"ID_original",
"text_original",
"ID_duplicate",
"text_duplicate",
"Similarity"
)]
if (rel_dist) {
duplicates.df$rel_dist <- vapply(seq_len(nrow(duplicates.df)), FUN.VALUE = 1, function(i) {
# length of longer string
mxln <- max(c(nchar(duplicates.df$text_original[i]), nchar(duplicates.df$text_duplicate[i])))
if (isTRUE(
abs(nchar(duplicates.df$text_original[i]) - nchar(duplicates.df$text_duplicate[i])) /
mxln <
length_diff &
max_length > mxln
)) {
stringdist::stringdist(
a = duplicates.df$text_original[i],
b = duplicates.df$text_duplicate[i],
method = "lv",
useBytes = FALSE,
nthread = nthread
) / # string distance
mxln # by length of string
} else {
NA
}
})
}
if (verbose) {
message(
"\r\t...processing date ",
as.character(x),
": ",
length(unique(duplicates.df$ID_duplicate)),
" duplicates found [",
format(
(Sys.time() - start_time), digits = 2, nsmall = 2
), "]. \t\t",
appendLF = FALSE
)
}
return(duplicates.df)
} else {
if (verbose) {
message("\r\t...processing date ", as.character(x), ": 0 duplicates found [",
format(
(Sys.time() - start_time), digits = 2, nsmall = 2
), "]. \t\t",
appendLF = FALSE
)
}
}
} else {
if (verbose) {
message("\r\t...processing date ", as.character(x), ": 0 duplicates found [",
format(
(Sys.time() - start_time), digits = 2, nsmall = 2
), "]. \t\t",
appendLF = FALSE
)
}
}
})
duplicates.df <- data.table::rbindlist(duplicates.df)
class(duplicates.df) <- c(class(duplicates.df), "lnt_sim")
time.elapsed <- Sys.time() - start_time
message(
"\r\nThreshold = ", threshold, "; ",
length(dates.d), " days processed; ",
length(unique(duplicates.df$ID_duplicate)), " duplicates found;",
" in ", format(time.elapsed, digits = 2, nsmall = 2)
)
attributes(duplicates.df)$call <- call
return(duplicates.df)
}
#' @title Convert Strings to dates
#'
#' @description Converts dates from string formats common in LexisNexis to a
#' date object.
#'
#' @param x A character object to be converted.
#' @param format Either "auto" to guess the format based on a common order of
#' day, month and year or provide a custom format (see
#' \link[stringi]{stri_datetime_format} for format options).
#' @param locale An ISO 639-1 locale code (see
#' \url{https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes}).
#' @param ... Not used.
#'
#' @return This function returns an object of class \link{Date}.
#' @export
#'
#' @examples
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE), convert_date = FALSE)
#' d <- lnt_asDate(LNToutput@meta$Date)
#' d
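#'
#' # A minimal sketch with an explicit format and locale instead of "auto"
#' # (format syntax follows stringi::stri_datetime_format):
#' lnt_asDate("12 Oktober 2017", format = "d MMMM yyyy", locale = "de")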
#' @importFrom stringi stri_replace_all_fixed stri_replace_all_regex
#' stri_datetime_parse stri_opts_fixed stri_datetime_symbols
#' stri_datetime_format
#' @importFrom utils head menu
lnt_asDate <- function(x,
format = "auto",
locale = "auto",
...) {
dat <- x
formats <- c(
English = "MMMM d,yyyy",
German = "d MMMM yyyy",
Spanish = "d MMMM yyyy",
Dutch = "d MMMM yyyy",
French = "d MMMM yyyy",
Portuguese = "d MMMM yyyy",
Italian = "d MMMM yyyy",
Russian = "d MMMM yyyy"
)
locales <- c(
English = "en",
German = "de",
Spanish = "es",
Dutch = "nl",
French = "fr",
Portuguese = "pt",
Italian = "it",
Russian = "ru"
)
if (!format == "auto") formats <- format
if (!locale == "auto") locales <- locale
for (loc in locales) {
dat <- stri_replace_all_regex(
str = dat,
pattern = paste0(
"\\b",
c(
stri_datetime_symbols(locale = loc)$Weekday,
"PM", "AM", "GMT"
),
"\\b"
),
replacement = "",
vectorize_all = FALSE,
opts_regex = stri_opts_regex(case_insensitive = TRUE)
)
}
dat <- stri_replace_all_regex(
str = dat,
pattern = c(
"[A-Z]{3}$",
"((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\\s?(?:am|AM|pm|PM))?)"
),
replacement = "",
vectorize_all = FALSE
)
dat <- stri_replace_all_fixed(
str = dat,
pattern = "Maerz",
replacement = "M\u00c4rz",
vectorize_all = FALSE
)
if (any(
format == "auto",
locale == "auto"
)) {
correct <- mapply(formats, locales, FUN = function(format, locale) {
out <- stringi::stri_datetime_parse(
str = dat,
format = format,
locale = locale,
tz = NULL
)
out <- 1 - sum(is.na(out)) / length(dat)
out * 100
})
most <- head(sort(correct[correct > 0.01], decreasing = TRUE), n = 3)
if (is.na(most[1])) stop("No valid dates found.")
if (most[1] < 100) {
if (interactive()) {
langchoice <- menu(
choices = c(
"Don't convert dates",
paste0(names(most)[1], " (", round(most[1], 2), "%", ")"),
paste0(names(most)[2], " (", round(most[2], 2), "%", ")"),
if (length(most) == 3) paste0(names(most)[3], " (", round(most[3], 2), "%", ")")
),
title = "More than one language was detected. Choose one:"
)
} else {
warning("More than one language was detected. The most likely one was chosen (",
paste0(names(most)[1], " ", round(most[1], 2), "%", ""),
")")
langchoice <- 2
}
} else {
langchoice <- 2
}
if (langchoice > 1) {
format <- formats[names(formats) == names(most)[langchoice - 1]]
locale <- locales[names(locales) == names(most)[langchoice - 1]]
} else {
return(x)
}
}
if (!format[1] %in% formats) {
message("A non-standard format was provided. Conversion is tried but might fail.")
}
dat <- stringi::stri_datetime_parse(
str = dat,
format = format,
tz = "UTC",
locale = locale
)
dat <- as.Date(dat)
return(dat)
}
#' @title Lookup keywords in articles
#'
#' @description This function looks for the provided pattern in the string or
#' LNToutput object. This can be useful, for example, to see which of the
#' keywords you used to retrieve the data actually appears in each article.
#' @details If an LNToutput object is provided, the function will look for the
#' pattern in the headlines and articles. The returned object is a list of
#' hits. If a regular expression is provided, the returned word will be the
#' actual value from the text.
#' @param x An LNToutput object or a string or vector of strings.
#' @param pattern A character vector of keywords. Word boundaries before and
#' after the keywords are honoured (see \code{word_boundaries}). Regular
#' expression can be used.
#' @param cores The number of CPU cores to use. Use \code{NULL} or \code{1} to
#' turn off parallel processing.
#' @param case_insensitive If FALSE, the pattern matching is case sensitive and
#' if TRUE, case is ignored during matching.
#' @param unique_pattern If TRUE, duplicated mentions of the same pattern are
#' removed.
#' @param word_boundaries If TRUE or "both", lookup is performed with word
#' boundaries at beginning and end of the pattern (i.e., pattern "protest"
#' will not identify "protesters" etc.). Additionally word boundaries can be
#' either just in front of the pattern ("before") or after the pattern
#' ("after"). FALSE searches without word boundaries.
#' @param verbose A logical flag indicating whether a status bar is printed to
#' the screen.
#' @return A list of keyword hits.
#'
#' @examples
#' # Make LNToutput object from sample
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#'
#' # Lookup keywords
#' LNToutput@meta$Keyword <- lnt_lookup(
#' LNToutput,
#' "statistical computing"
#' )
#'
#' # Keep only articles which mention the keyword
#' LNToutput_stat <- LNToutput[!sapply(LNToutput@meta$Keyword, is.null)]
#'
#' # Convert list of keywords to string
#' LNToutput@meta$Keyword <- sapply(LNToutput@meta$Keyword, toString)
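#'
#' # A sketch with a regular expression ("statist\\w*" is a made-up pattern);
#' # the returned values are the actual matches found in the text
#' LNToutput@meta$Keyword <- lnt_lookup(LNToutput, "statist\\w*")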
#' @author Johannes Gruber
#' @export
#' @importFrom pbapply pboptions pblapply
#' @importFrom parallel makeCluster stopCluster clusterExport
#' @importFrom stringi stri_join stri_extract_all_regex stri_opts_regex
lnt_lookup <- function(x,
pattern,
case_insensitive = FALSE,
unique_pattern = FALSE,
word_boundaries = c("both", "before", "after"),
cores = NULL,
verbose = TRUE) {
UseMethod("lnt_lookup")
}
#' @rdname lnt_lookup
#' @noRd
#' @export
lnt_lookup.default <- function(x, ...) {
stop("'x' must be either a character vector or LNToutput object.")
}
#' @rdname lnt_lookup
#' @noRd
#' @export
lnt_lookup.LNToutput <- function(x,
pattern,
case_insensitive = FALSE,
unique_pattern = FALSE,
word_boundaries = c("both", "before", "after"),
cores = NULL,
verbose = TRUE) {
IDs <- x@meta$ID
x <- stringi::stri_join(x@meta$Headline,
x@articles$Article,
sep = " \n "
)
names(x) <- IDs
lnt_lookup(x = x,
pattern = pattern,
case_insensitive = case_insensitive,
unique_pattern = unique_pattern,
word_boundaries = word_boundaries,
cores = cores,
verbose = verbose)
}
#' @rdname lnt_lookup
#' @noRd
#' @export
lnt_lookup.character <- function(x,
pattern,
case_insensitive = FALSE,
unique_pattern = FALSE,
word_boundaries = c("both", "before", "after"),
cores = NULL,
verbose = TRUE) {
if (!is.null(word_boundaries) && !isFALSE(word_boundaries)) {
if (isTRUE(word_boundaries) || word_boundaries[1] == "both") {
pattern <- paste0(
"\\b",
pattern,
"\\b"
)
}
if (word_boundaries[1] == "before") {
pattern <- paste0(
"\\b",
pattern
)
}
if (word_boundaries[1] == "after") {
pattern <- paste0(
pattern,
"\\b"
)
}
}
if (!verbose) {
pbapply::pboptions(type = "none")
} else {
pbapply::pboptions(type = "timer")
}
if (isTRUE(cores > 1)) {
cl <- parallel::makeCluster(cores)
force(pattern)
parallel::clusterExport(cl = cl, varlist = "pattern", envir = environment())
} else {
cl <- NULL
}
result <- pbapply::pblapply(x, cl = cl, function(s) {
out <- stringi::stri_extract_all_regex(
str = s,
pattern = pattern,
simplify = FALSE,
omit_no_match = FALSE,
opts_regex = stringi::stri_opts_regex(
case_insensitive = case_insensitive
)
)
out[is.na(out)] <- NULL
if (unique_pattern) {
return(unique(unlist(out)))
} else {
return(unlist(out))
}
})
if (!is.null(cl)) {
parallel::stopCluster(cl)
}
return(result)
}
#' @title Display diff of similar articles
#'
#' @description This function is a wrapper for \link[diffobj]{diffPrint}. It is
#' intended to help performing a manual assessment of the difference between
#' highly similar articles identified via \link{lnt_similarity}.
#'
#' @param x lnt_sim object as returned by \link{lnt_similarity}.
#' @param min Minimum value of rel_dist to include in diff.
#' @param max Maximum value of rel_dist to include in diff.
#' @param n Size of displayed sample.
#' @param output_html Set to TRUE to output HTML code, e.g., for knitting an
#' rmarkdown document to HTML. The chunk option must be set to
#' \code{results='asis'} in that case.
#' @param ... Currently not used.
#'
#' @examples
#' \dontrun{
#' # Test similarity of articles
#' duplicates.df <- lnt_similarity(
#' LNToutput = lnt_read(lnt_sample(copy = FALSE)),
#' threshold = 0.97
#' )
#'
#' lnt_diff(duplicates.df, min = 0.18, max = 0.30)
#' }
#' @author Johannes Gruber
#' @export
#' @importFrom quanteda tokens
lnt_diff <- function(x,
min = 0.15,
max = 0.3,
n = 25,
output_html = FALSE,
...) {
if (!"lnt_sim" %in% class(x)) {
warning("'x' should be an object returned by lnt_similarity().")
}
check_install("diffobj")
if (!"rel_dist" %in% colnames(x)) {
stop("'x' must contain a column with rel_dist information (see ?lnt_similarity)")
}
x <- x[x$rel_dist > min & x$rel_dist < max, ]
if (nrow(x) < n) {
n <- nrow(x)
}
sample <- sample(x = seq_len(nrow(x)), size = n)
x <- x[sample, ]
x <- x[order(x$rel_dist), ]
for (i in seq_len(nrow(x))) {
original <- unname(unlist(quanteda::tokens(x$text_original[i], what = "sentence")))
duplicate <- unname(unlist(quanteda::tokens(x$text_duplicate[i], what = "sentence")))
diff <- diffobj::diffPrint(
current = original,
target = duplicate,
mode = "sidebyside",
cur.banner = paste("ID:", x$ID_original[i]),
tar.banner = paste0(
"ID: ", x$ID_duplicate[i], ", rel_dist: ",
round(x$rel_dist[i], digits = 2)
),
format = ifelse(output_html, "html", "auto"),
interactive = !output_html
)
print(diff)
}
if (output_html) {
cat(
"<style>", readLines(system.file("css", "diffobj.css", package = "diffobj")),
"</style>"
)
}
}
# Conversion ------------------------------------------------------------
#' Convert LNToutput to other formats
#'
#' Takes output from \link{lnt_read} and converts it to other formats. You can
#' either use \code{lnt_convert()} and choose the output format via \code{to} or
#' use the individual functions directly.
#'
#' @param x An object of class LNToutput.
#' @param to Which format to convert into. Possible values are "rDNA",
#' "corpustools", "tidytext", "tm", "SQLite" and "quanteda".
#' @param what Either "articles" or "paragraphs" to use articles or paragraphs as
#' text in the output object.
#' @param collapse Only has an effect when \code{what = "articles"}. If set to
#' TRUE, an empty line will be added after each paragraph. Alternatively, you
#' can enter a custom string (such as \code{"\\n"} for newline). \code{NULL}
#' or \code{FALSE} turns off this feature.
#' @param file The name of the database to be written to (for lnt2SQLite only).
#' @param ... Passed on to different methods (see details).
#'
#' @details lnt_convert() provides conversion methods into several formats
#' commonly used in prominent R packages for text analysis. Besides the
#' options set here, the ... (ellipsis) is passed on to the individual methods
#' for tuning the outcome:
#'
#' * data.frame, rDNA ... not used.
#'
#' * quanteda ... passed on to [quanteda::corpus()].
#'
#' * corpustools ... passed on to [corpustools::create_tcorpus()].
#'
#' * tm ... passed on to [tm::Corpus()].
#'
#' * tidytext ... passed on to [tidytext::unnest_tokens()].
#'
#' * lnt2SQLite ... passed on to [`RSQLite::dbWriteTable-method()`][`RSQLite::dbWriteTable,SQLiteConnection,character,character-method`].
#'
#' @examples
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#'
#' df <- lnt_convert(LNToutput, to = "data.frame")
#'
#' docs <- lnt_convert(LNToutput, to = "rDNA")
#'
#' corpus <- lnt_convert(LNToutput, to = "quanteda")
#'
#' \dontrun{
#'
#' tCorpus <- lnt_convert(LNToutput, to = "corpustools")
#'
#' tidy <- lnt_convert(LNToutput, to = "tidytext")
#'
#' Corpus <- lnt_convert(LNToutput, to = "tm")
#'
#' dbloc <- lnt_convert(LNToutput, to = "SQLite")
#' }
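#'
#' # A sketch of the collapse argument: rebuild articles from paragraphs with
#' # an empty line (or a custom string) inserted between paragraphs
#' corpus2 <- lnt_convert(LNToutput, to = "quanteda", collapse = TRUE)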
#'
#' @export
#' @md
lnt_convert <- function(x,
to = "data.frame",
what = "articles",
collapse = FALSE,
file = "LNT.sqlite",
...) {
valid_to <- c("data.frame",
"rDNA",
"quanteda",
"SQLite",
"corpustools",
"tm",
"tidytext")
if (!to %in% valid_to) {
stop(to, " is not a valid selection. Choose one of: ",
paste(valid_to, collapse = ", "))
}
switch(
to,
"data.frame" = return(lnt2df(x, what = what, collapse = collapse)),
"rDNA" = return(lnt2rDNA(x, what = what, collapse = collapse)),
"quanteda" = return(lnt2quanteda(x, what = what, collapse = collapse, ...)),
"SQLite" = return(lnt2SQLite(x, file = file, collapse = collapse, ...)),
"corpustools" = return(lnt2cptools(x, what = what, collapse = collapse, ...)),
"tm" = return(lnt2tm(x, what = what, collapse = collapse, ...)),
"tidytext" = return(lnt2tidy(x, what = what, ...))
)
}
#' @rdname lnt_convert
#' @importFrom tibble as_tibble
#' @export
lnt2df <- function(x, what = "articles", ...) {
what <- tolower(what)
if (!what %in% c("articles", "paragraphs")) {
stop("Choose either \"articles\" or \"paragraphs\" as what argument.")
}
if (what == "articles") {
df <- merge.data.frame(x@meta,
x@articles,
by = "ID"
)
} else if (what == "paragraphs") {
df <- merge.data.frame(
x@paragraphs,
x@meta,
by.x = "Art_ID",
by.y = "ID"
)
}
return(as_tibble(df))
}
#' @rdname lnt_convert
#' @export
lnt2rDNA <- function(x, what = "articles", collapse = TRUE) {
what <- tolower(what)
if (!what %in% c("articles", "paragraphs")) {
stop("Choose either \"articles\" or \"paragraphs\" as what argument.")
}
if (isTRUE(collapse)) {
collapse <- "\n\n"
} else if (isFALSE(collapse)) {
collapse <- NULL
}
if (what == "articles") {
if (is.null(collapse)) {
text <- x@articles$Article
} else {
text <- vapply(x@meta$ID, FUN.VALUE = character(1), function(id) {
stringi::stri_join(x@paragraphs$Paragraph[x@paragraphs$Art_ID == id],
sep = "",
collapse = collapse,
ignore_null = FALSE
)
})
}
notes <- paste("ID:", x@meta$ID)
order <- seq_along(x@meta$ID)
} else if (what == "paragraphs") {
text <- x@paragraphs$Paragraph
notes <- paste0("Art_ID: ", x@paragraphs$Art_ID, "; Par_ID", x@paragraphs$Par_ID)
order <- match(x@paragraphs$Art_ID, x@meta$ID)
}
dta <- data.frame(
id = seq_along(order),
title = vapply(x@meta$Headline[order],
FUN.VALUE = character(1),
FUN = trim,
n = 197,
USE.NAMES = FALSE),
text = text,
coder = 1,
author = x@meta$Author[order],
source = x@meta$Newspaper[order],
section = x@meta$Section[order],
notes = notes,
type = "newspaper",
date = x@meta$Date[order],
stringsAsFactors = FALSE
)
if (any(grepl("Date", class(dta$date)))) {
dta$date <- as.POSIXct.Date(dta$date)
}
if (any(is.na(dta$date), !any(grepl("POSIXct", class(dta$date))))) {
warning(paste(
"One or more (or all) dates could not be converted to POSIXct.",
"NA entries in 'date' were filled with the system's time and date instead."
))
dta$date <- tryCatch(as.POSIXct(dta$date),
error = function(e) NA
)
dta$date[is.na(dta$date)] <- Sys.time()
if (isTRUE(class(dta$date) == "numeric")) {
dta$date <- as.POSIXct.numeric(dta$date, origin = "1970-01-01")
}
}
dta[is.na(dta)] <- ""
return(dta)
}
#' @rdname lnt_convert
#' @export
#' @importFrom quanteda corpus meta
lnt2quanteda <- function(x, what = "articles", collapse = NULL, ...) {
what <- tolower(what)
if (!what %in% c("articles", "paragraphs")) {
stop("Choose either \"articles\" or \"paragraphs\" as what argument.")
}
if (isTRUE(collapse)) {
collapse <- "\n\n"
} else if (isFALSE(collapse)) {
collapse <- NULL
}
if (what == "articles") {
if (is.null(collapse)) {
text <- x@articles$Article
} else {
text <- vapply(x@meta$ID, FUN.VALUE = character(1), function(id) {
stringi::stri_join(x@paragraphs$Paragraph[x@paragraphs$Art_ID == id],
sep = "",
collapse = collapse,
ignore_null = FALSE
)
})
}
ID <- x@meta$ID
meta <- x@meta
} else if (what == "paragraphs") {
text <- x@paragraphs$Paragraph
ID <- x@paragraphs$Par_ID
meta <- merge(
x@meta,
x@paragraphs[, c("Art_ID", "Par_ID")],
by.x = "ID",
by.y = "Art_ID",
all.x = FALSE,
all.y = TRUE
)
}
dots <- list(...)
if (any(grepl("meta", names(dots)))) {
metacorpus <- c(list(
converted_from = "LexiNexisTools"),
dots$meta
)
dots$meta <- NULL
} else {
metacorpus <- list(converted_from = "LexisNexisTools")
}
# splice the remaining ... arguments into the call instead of passing the
# list positionally (which would land in the wrong argument of corpus())
dta <- do.call(corpus, c(
list(
x = text,
docnames = as.character(ID),
docvars = meta
),
dots
))
quanteda::meta(dta, names(metacorpus)) <- unname(unlist(unname(metacorpus)))
return(dta)
}
#' @rdname lnt_convert
#' @export
lnt2tm <- function(x, what = "articles", collapse = NULL, ...) {
what <- tolower(what)
if (!what %in% c("articles", "paragraphs")) {
stop("Choose either \"articles\" or \"paragraphs\" as what argument.")
}
check_install("tm")
if (isTRUE(collapse)) {
collapse <- "\n\n"
} else if (isFALSE(collapse)) {
collapse <- NULL
}
if (what == "articles") {
if (is.null(collapse)) {
text <- x@articles$Article
} else {
text <- vapply(x@meta$ID, FUN.VALUE = character(1), function(id) {
stringi::stri_join(x@paragraphs$Paragraph[x@paragraphs$Art_ID == id],
sep = "",
collapse = collapse,
ignore_null = FALSE
)
})
}
df <- data.frame(
doc_id = x@articles$ID,
text = text
)
df <- merge.data.frame(df,
x@meta,
by.x = "doc_id",
by.y = "ID"
)
} else if (what == "paragraphs") {
df <- data.frame(
doc_id = x@paragraphs$Par_ID,
text = x@paragraphs$Paragraph,
par_id = x@paragraphs$Par_ID
)
df <- merge.data.frame(df,
x@meta,
by.x = "doc_id",
by.y = "ID",
all.x = TRUE
)
}
corpus <- tm::Corpus(tm::DataframeSource(df), ...)
return(corpus)
}
#' @rdname lnt_convert
#' @export
#' @importFrom methods slot slotNames
lnt2cptools <- function(x, what = "articles", ...) {
what <- tolower(what)
if (!what %in% c("articles", "paragraphs")) {
stop("Choose either \"articles\" or \"paragraphs\" as what argument.")
}
check_install("corpustools")
if (what == "articles") {
text <- x@articles$Article
ID <- x@meta$ID
meta <- x@meta
} else if (what == "paragraphs") {
text <- x@paragraphs$Paragraph
ID <- x@paragraphs$Par_ID
meta <- merge(
x@meta,
x@paragraphs[, c("Art_ID", "Par_ID")],
by.x = "ID",
by.y = "Art_ID",
all.x = FALSE,
all.y = TRUE
)
}
tcorpus <- corpustools::create_tcorpus(
x = text,
doc_id = ID,
meta = meta,
...
)
return(tcorpus)
}
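# A minimal usage sketch (assuming corpustools is installed):
#
# LNToutput <- lnt_read(lnt_sample(copy = FALSE))
# tc <- lnt2cptools(LNToutput, what = "paragraphs")
# tc$meta  # article and paragraph IDs are merged into the meta data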
#' @rdname lnt_convert
#' @export
lnt2tidy <- function(x, what = "articles", ...) {
what <- tolower(what)
if (!what %in% c("articles", "paragraphs")) {
stop("Choose either \"articles\" or \"paragraphs\" as what argument.")
}
check_install("tidytext")
df <- lnt2df(x, what = what)
if (what == "articles") {
tidy <- tidytext::unnest_tokens(
tbl = df,
input = "Article",
output = "Token",
...
)
} else if (what == "paragraphs") {
tidy <- tidytext::unnest_tokens(
tbl = df,
input = "Paragraph",
output = "Token",
...
)
}
return(tidy)
}
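# A minimal usage sketch (assuming tidytext is installed; arguments in ...
# are passed on to tidytext::unnest_tokens, e.g. token = "sentences"):
#
# LNToutput <- lnt_read(lnt_sample(copy = FALSE))
# tidy <- lnt2tidy(LNToutput, what = "articles")
# head(tidy$Token)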
#' @rdname lnt_convert
#' @export
#' @importFrom methods slot slotNames
lnt2SQLite <- function(x, file = "LNT.sqlite", ...) {
check_install("RSQLite")
db <- RSQLite::dbConnect(RSQLite::SQLite(), file)
for (i in slotNames(x)) {
RSQLite::dbWriteTable(
conn = db,
name = i,
value = slot(x, i),
...
)
}
# note: the connection is returned open so the tables remain accessible;
# disconnect with RSQLite::dbDisconnect() when done
return(db)
}
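# A minimal usage sketch (assuming RSQLite is installed; the returned
# connection stays open and should be closed by the caller):
#
# LNToutput <- lnt_read(lnt_sample(copy = FALSE))
# db <- lnt2SQLite(LNToutput, file = tempfile(fileext = ".sqlite"))
# RSQLite::dbReadTable(db, "meta")  # one table per slot: meta, articles, paragraphs
# RSQLite::dbDisconnect(db)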
#' Convert articles from an LNToutput object to BibTeX
#'
#' Takes output from \link{lnt_read} and converts chosen articles to BibTeX
#' citations.
#'
#' @param x An object of class LNToutput.
#' @param art_id The ID(s) of the article(s) to convert.
#' @param ... unused.
#'
#' @importFrom tools toTitleCase
#' @export
#'
#' @examples
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#'
#' bib <- lnt2bibtex(LNToutput, art_id = 1)
lnt2bibtex <- function(x, art_id, ...) {
dat <- x[x@meta$ID %in% art_id]
out <- lapply(seq_len(nrow(dat)), function(i) {
meta <- dat[i]@meta
bib <- c(
"@article{",
paste0(" author = {", tools::toTitleCase(tolower(meta$Author)), "},"),
paste0(" year = {", meta$Date, "},"),
paste0(" title = {", meta$Headline, "},"),
paste0(" volume = {", format(meta$Date, "%Y"), "},"),
paste0(" journal = {", meta$Newspaper, "}"),
"}"
)
attr(bib, "names") <- c(
"",
"author",
"year",
"title",
"volume",
"journal",
""
)
class(bib) <- "Bibtex"
return(bib)
})
if (length(out) > 1) {
return(out)
} else {
return(out[[1]])
}
}
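# A usage note: the returned entries have class "Bibtex" and print accordingly;
# they can be written to a (hypothetical) file with, e.g.,
# writeLines(unlist(lnt2bibtex(LNToutput, art_id = 1)), "refs.bib")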
# Miscellaneous ------------------------------------------------------------
#' Check if a package is installed
#'
#' Internal function that checks whether a (suggested) package is available
#' and, in interactive sessions, offers to install it.
#'
#' @param pkg The name of the package to check, as a character string.
#'
#' @noRd
#'
#' @importFrom utils install.packages menu
check_install <- function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
if (interactive()) {
message(
"Package \"",
pkg,
"\" is needed for this function to work. ",
"Should I install it for you?"
)
installchoice <- menu(c("yes", "no"))
if (installchoice == 1) install.packages(pkgs = pkg)
} else {
stop("Package \"", pkg, "\" is needed for this function to work.",
" Please install it.",
call. = FALSE
)
}
}
}
#' @title Adds or replaces articles
#'
#' @description This function adds a data.frame to a slot in an LNToutput
#' object or overwrites existing entries. Its main use is to add an extract
#' of one of the data.frames back to an LNToutput object after operations
#' were performed on it.
#' @details Note that when adding paragraphs, the Par_ID column is used to
#' determine whether entries are already present in the set. For the other
#' data.frames the article ID is used.
#' @param to an LNToutput object to which something should be added.
#' @param what A data.frame which is added.
#' @param where Either "meta", "articles" or "paragraphs" to indicate the slot
#' to which data is added.
#' @param replace If \code{TRUE}, entries which have the same ID as rows in
#'   \code{what} will be overwritten.
#'
#' @examples
#' # Make LNToutput object from sample
#' LNToutput <- lnt_read(lnt_sample(copy = FALSE))
#'
#' # extract meta and make corrections
#' correction <- LNToutput@meta[grepl("Wikipedia", LNToutput@meta$Headline), ]
#' correction$Newspaper <- "Wikipedia"
#'
#' # replace corrected meta information
#' LNToutput <- lnt_add(to = LNToutput, what = correction, where = "meta", replace = TRUE)
#' @author Johannes Gruber
#' @export
#' @importFrom methods slot
lnt_add <- function(to,
what,
where = "meta",
replace = TRUE) {
if (where %in% c("meta", "articles")) {
temp <- slot(to, where)
if (any(what$ID %in% temp$ID)) {
if (replace) {
update <- what$ID %in% temp$ID
temp <- temp[!temp$ID %in% what$ID, ]
temp <- rbind(
temp,
what[update, ]
)
if (sum(!update) > 0) {
other <- c("meta", "articles")[!c("meta", "articles") %in% where]
if (!all(what$ID[!update] %in% slot(to, other)$ID)) {
warning("Some or all entries you added have no equivalent in other slots of \"to\".")
}
temp <- rbind(
temp,
what[!update, ]
)
}
temp <- temp[order(temp$ID), ]
message(sum(update), " entries in ", where, " replaced, ", sum(!update), " newly added.")
} else {
update <- !what$ID %in% temp$ID
temp <- rbind(temp, what[update, ])
temp <- temp[order(temp$ID), ]
message(sum(update), " entries added to ", where, ", ", sum(!update), " already present.")
}
} else {
temp <- rbind(temp, what)
temp <- temp[order(temp$ID), ]
message(nrow(what), " entries added to ", where, ".")
}
} else if (where %in% "paragraphs") {
temp <- slot(to, where)
if (any(what$Par_ID %in% temp$Par_ID)) {
if (replace) {
update <- what$Par_ID %in% temp$Par_ID
temp <- temp[!temp$Par_ID %in% what$Par_ID, ]
temp <- rbind(temp, what[update, ])
temp <- temp[order(temp$Par_ID), ]
message(sum(update), " entries in ", where, " replaced, ", sum(!update), " newly added.")
} else {
update <- !what$Par_ID %in% temp$Par_ID
temp <- rbind(temp, what[update, ])
temp <- temp[order(temp$Par_ID), ]
message(sum(update), " entries added to ", where, ", ", sum(!update), " already present.")
}
} else {
temp <- rbind(temp, what)
temp <- temp[order(temp$Par_ID), ]
message(nrow(what), " entries added to ", where, ".")
}
} else {
stop("Choose either 'meta', 'articles' or 'paragraphs' as 'to' argument.")
}
if (where %in% "meta") {
to@meta <- temp
} else if (where %in% "articles") {
to@articles <- temp
} else if (where %in% "paragraphs") {
to@paragraphs <- temp
}
if (!isTRUE(
all.equal(length(to@meta$ID), length(to@articles$ID), length(unique(to@paragraphs$Art_ID)))
)) {
warning("Returned object is out of balance (one slot has more or less entries than another.")
}
return(to)
}
#' Provides a small sample TXT/DOCX file
#'
#' Copies a small TXT sample file (as used by the old Nexis) or a DOCX file
#' (as used by Nexis Uni or Lexis Advance) to the current working directory
#' and returns the location of this newly created file. The content of the
#' file is either made up or copied from Wikipedia, since real articles from
#' LexisNexis fall under copyright law and cannot be shared.
#'
#' The file serves as a small sample database to test the functions of
#' LexisNexisTools.
#'
#' @param format Either "txt" to get the sample.TXT file or "docx" to get the
#' format used by Nexis Uni.
#' @param overwrite Should the sample file be overwritten if found in the
#' current working directory?
#' @param verbose Display a warning message if the file already exists in the
#'   current working directory.
#' @param path The destination path for the sample file (current working
#' directory if \code{NULL})
#' @param copy Logical. Should the file be copied to path/working directory? If
#' \code{FALSE}, the function only returns the location of the sample file.
#'
#' @examples
#' \dontrun{
#' lnt_sample()
#' }
#' @author Johannes Gruber
#' @export
lnt_sample <- function(format = "txt",
overwrite = FALSE,
verbose = TRUE,
path = NULL,
copy = TRUE) {
if (is.null(path)) {
path <- getwd()
}
f <- switch(tolower(format),
txt = "sample.TXT",
docx = "sample.DOCX"
)
if (is.null(f)) stop("Choose either \"txt\" or \"docx\" as format.")
if (copy) {
to <- paste0(path, "/", f)
if (file.exists(to) && !overwrite) {
if (verbose) {
warning(
"Sample file exists in wd. Use overwrite = TRUE to create fresh sample file."
)
}
} else {
file.copy(
from = system.file("extdata", f, package = "LexisNexisTools"),
to = to,
overwrite = TRUE
)
}
} else {
to <- system.file("extdata", f, package = "LexisNexisTools")
}
return(to)
}
#' Status message
#'
#' Internal function to print status messages
#'
#' @param m Message to be included in the status.
#' @param v Verbose argument passed on from other functions.
#' @param start_time Start time, to calculate time elapsed since start.
#'
#' @noRd
status <- function(m, v, start_time) {
if (v) {
message(m, " [", format(
(Sys.time() - start_time),
digits = 2, nsmall = 2
), "]")
}
}
#' Truncate
#'
#' Internal function, used to truncate text
#'
#' @param x A character string
#' @param n Max number of characters to truncate to. Value \code{Inf} turns off
#' truncation.
#' @param e String added at the end of x to signal it was truncated.
#'
#' @noRd
#' @author Johannes B. Gruber
trim <- function(x, n, e = "...") {
x <- ifelse(nchar(x) > n,
paste0(
gsub(
"\\s+$", "",
strtrim(x, width = n)
),
e
),
x
)
x[is.na(x)] <- ""
return(x)
}
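# e.g. trim("Hello world foo", n = 6) returns "Hello..." (truncated at six
# characters, with trailing whitespace removed before the ellipsis is added)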
#' Get files
#'
#' Find files from LexisNexis in folder(s).
#'
#' @param x character, name or names of file(s) or folder(s) to be searched.
#' @param types file types/extensions to be searched.
#' @param recursive logical. Should the listing recurse into directories?
#'
#' @importFrom stringi stri_replace_all_fixed
#' @importFrom tools file_ext
#' @importFrom utils unzip menu
#'
#' @noRd
get_files <- function(x,
types = c("txt", "rtf", "doc", "pdf", "docx", "zip"),
recursive = TRUE) {
# Check how files are provided
# 1. nothing (search wd)
# 2. file or files
# 3. folder name(s)
if (missing(x)) {
message("No path was given. Should files",
"in working directory be renamed? [y/n]")
if (interactive()) {
if (menu(c("yes", "no")) == 1) {
x <- getwd()
} else {
stop("Aborted by user")
}
} else {
stop("No path was given as x.")
}
}
if (all(tolower(tools::file_ext(x)) %in% types)) {
files <- x
} else if (any(tolower(tools::file_ext(x)) %in% types)) {
warning("Not all provided files were TXT, DOC, RTF, PDF or DOCX files. Other formats are ignored.")
files <- x[tolower(tools::file_ext(x)) %in% types]
} else if (any(dir.exists(x))) {
# list.files is vectorised over paths, so one call covers all folders
files <- list.files(
path = x,
full.names = TRUE,
recursive = recursive
)
files <- files[tolower(tools::file_ext(files)) %in% types]
} else {
stop("Provide either file name(s) ending on ",
paste(types, collapse = ", "),
" or folder name(s) to x or leave blank to search wd.")
}
if (length(files) > 0) {
zips <- tolower(tools::file_ext(files)) == "zip"
if (any(zips)) {
temp <- paste0(tempdir(), "/zips")
lapply(files[zips], unzip, exdir = temp)
files <- c(files[!zips],
get_files(temp))
}
return(files)
} else {
stop("No ",
paste(types, collapse = ", "),
" files found.")
}
}
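# Example (with a hypothetical folder): get_files("~/Nexis") lists all TXT,
# RTF, DOC, PDF and DOCX files in that folder (recursively) and unpacks any
# ZIP archives into a temporary directory before collecting their contents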
#' Read files into lines
#'
#' Internal function, used to read files of different formats
#'
#' @param files character, name or names of files to be read.
#' @param encoding Encoding to be assumed for input files.
#'
#' @importFrom stringi stri_extract_last_regex stri_read_lines stri_paste stri_split_fixed stri_detect_regex
#'
#' @noRd
#' @author Johannes B. Gruber
#'
lnt_read_lines <- function(files,
encoding) {
files <- split(files, tolower(stri_extract_last_regex(files, ".{4}$")))
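# note: the last four characters include the leading dot for three-letter
# extensions (".txt", ".doc", ".rtf", ".pdf") but not for "docx", which is
# why the list is accessed as files$.txt but files$docx below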
### read in txt file
if (length(files$.txt) > 0) {
if (length(files$.txt) > 1) {
lines_txt <- unlist(lapply(files$.txt, function(f) {
out <- stri_read_lines(f, encoding = encoding)
names(out) <- rep(f, times = length(out))
out
}))
} else {
lines_txt <- stri_read_lines(files$.txt, encoding = encoding)
names(lines_txt) <- rep(files$.txt, times = length(lines_txt))
}
} else {
lines_txt <- character()
}
### read in doc file
if (length(files$.doc) > 0) {
check_install("striprtf")
if (length(files$.doc) > 1) {
# ignore lock files
files$.doc <- files$.doc[!grepl("^~\\$", basename(files$.doc))]
lines_doc <- unlist(lapply(files$.doc, function(f) {
out <- striprtf::read_rtf(f)
names(out) <- rep(f, times = length(out))
out
}))
} else {
lines_doc <- striprtf::read_rtf(files$.doc)
names(lines_doc) <- rep(files$.doc, times = length(lines_doc))
}
} else {
lines_doc <- character()
}
### read in rtf file
if (length(files$.rtf) > 0) {
check_install("striprtf")
if (length(files$.rtf) > 1) {
lines_rtf <- unlist(lapply(files$.rtf, function(f) {
out <- striprtf::read_rtf(f)
names(out) <- rep(f, times = length(out))
out
}))
} else {
lines_rtf <- striprtf::read_rtf(files$.rtf)
names(lines_rtf) <- rep(files$.rtf, times = length(lines_rtf))
}
} else {
lines_rtf <- character()
}
### read in pdf file
if (length(files$.pdf) > 0) {
check_install("pdftools")
if (length(files$.pdf) > 1) {
lines_pdf <- unlist(lapply(files$.pdf, function(f) {
out <- pdftools::pdf_text(f)
out <- stri_paste(out, collapse = "\n")
out <- unlist(stri_split_fixed(out, "\n"))
# remove page number
out <- out[!stri_detect_regex(out, "^\\s{75,}")]
# remove page break
out <- out[!stri_detect_regex(out, "^$")]
names(out) <- rep(f, times = length(out))
out
}))
} else {
lines_pdf <- pdftools::pdf_text(files$.pdf)
lines_pdf <- stri_paste(lines_pdf, collapse = "\n")
lines_pdf <- unlist(stri_split_fixed(lines_pdf, "\n"))
lines_pdf <- lines_pdf[!stri_detect_regex(lines_pdf, "^\\s{75,}")]
lines_pdf <- lines_pdf[!stri_detect_regex(lines_pdf, "^$")]
names(lines_pdf) <- rep(files$.pdf, times = length(lines_pdf))
}
warning("Reading PDFs is experimental. Extracting paragraphs from PDFs does ",
"not work correctly. Page headers end up in articles.")
} else {
lines_pdf <- character()
}
### read in docx (nexis uni)
if (length(files$docx) > 0) {
check_install("xml2")
if (length(files$docx) > 1) {
# ignore lock files
files$docx <- files$docx[!grepl("^~\\$", basename(files$docx))]
lines_docx <- unlist(lapply(files$docx, function(f) {
con <- unz(description = f, filename = "word/document.xml")
out <- xml2::read_xml(con, encoding = "utf-8")
rm(con)
out <- xml2::xml_find_all(out, "//w:p")
out <- xml2::xml_text(out)
names(out) <- rep(f, times = length(out))
out
}))
} else {
con <- unz(description = files$docx, filename = "word/document.xml")
lines_docx <- xml2::read_xml(con)
rm(con)
lines_docx <- xml2::xml_find_all(lines_docx, "//w:p")
lines_docx <- xml2::xml_text(lines_docx)
names(lines_docx) <- rep(files$docx, times = length(lines_docx))
}
} else {
lines_docx <- character()
}
return(list(nexis = c(lines_txt, lines_doc, lines_rtf, lines_pdf),
uni = lines_docx))
}