# demo.R
# In LexisNexisTools: Working with Files from 'LexisNexis'

## ----setup, include = FALSE---------------------------------------------------
# Document-wide knitr chunk defaults: the `collapse` and `comment` options are
# set once here via opts_chunk instead of being repeated on each chunk.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library("kableExtra")

## ----message=FALSE------------------------------------------------------------
library("LexisNexisTools")

## ----eval=FALSE---------------------------------------------------------------
#  lnt_sample()

## ----eval=FALSE---------------------------------------------------------------
#  report <- lnt_rename()

## ----eval=FALSE---------------------------------------------------------------
#  report <- lnt_rename(x = getwd(), report = TRUE)

## ----eval=FALSE---------------------------------------------------------------
#  my_files <- list.files(pattern = ".txt", path = getwd(),
#                         full.names = TRUE, recursive = TRUE, ignore.case = TRUE)
#  report <- lnt_rename(x = my_files, report = TRUE)
#  
#  report

## ----echo=FALSE---------------------------------------------------------------
library(kableExtra)

# Copy the sample file bundled with the package to a temporary ".TXT" path.
# The return value of file.copy() is assigned to `silent`, presumably so that
# it is not auto-printed into the rendered document.
temp <- tempfile(fileext = ".TXT")
bundled_sample <- system.file(
  "extdata", "sample.TXT",
  package = "LexisNexisTools"
)
silent <- file.copy(from = bundled_sample, to = temp, overwrite = TRUE)

# Run lnt_rename() on the copy (simulate = FALSE) and keep its report.
report <- lnt_rename(x = temp, simulate = FALSE, report = TRUE, verbose = FALSE)

# Remember the new name exactly as reported; it is read back in later.
newfile <- report$name_new

# Display fixed/short file names in the table rather than temporary paths.
report$name_orig <- "sample.TXT"
report$name_new <- basename(newfile)
kable(report, format = "markdown")

## ----eval=FALSE---------------------------------------------------------------
#  LNToutput <- lnt_read(x = getwd())

## ----echo=FALSE, message=FALSE------------------------------------------------
# Read the renamed sample file, then reduce the Source_File column of the meta
# slot to bare file names so no temporary directory shows up in the output.
LNToutput <- lnt_read(x = newfile)
LNToutput@meta[["Source_File"]] <- basename(LNToutput@meta[["Source_File"]])

## ----eval=FALSE---------------------------------------------------------------
#  meta_df <- LNToutput@meta
#  articles_df <- LNToutput@articles
#  paragraphs_df <- LNToutput@paragraphs
#  
#  # Print meta to get an idea of the data
#  head(meta_df, n = 3)
#  

## ----echo=FALSE---------------------------------------------------------------
# Copy the paragraphs, articles and meta slots into standalone objects.
paragraphs_df <- LNToutput@paragraphs
articles_df <- LNToutput@articles
meta_df <- LNToutput@meta

# Keep only the bare file name in Source_File, then render the first three
# rows of the metadata as a markdown table.
meta_df[["Source_File"]] <- basename(meta_df[["Source_File"]])
kable(head(meta_df, n = 3), format = "markdown")


## ----message=FALSE------------------------------------------------------------
# Convert to a data.frame, leaving `what` at its default
meta_articles_df <- lnt_convert(
  LNToutput,
  to = "data.frame"
)

# Or keep the paragraphs
meta_paragraphs_df <- lnt_convert(
  LNToutput,
  to = "data.frame",
  what = "Paragraphs"
)

## ----eval=FALSE---------------------------------------------------------------
#  rDNA_docs <- lnt_convert(LNToutput, to = "rDNA")
#  
#  quanteda_corpus <- lnt_convert(LNToutput, to = "quanteda")
#  
#  tCorpus <- lnt_convert(LNToutput, to = "corpustools")
#  
#  tidy <- lnt_convert(LNToutput, to = "tidytext")
#  
#  Corpus <- lnt_convert(LNToutput, to = "tm")
#  
#  dbloc <- lnt_convert(LNToutput, to = "SQLite")

## ----eval=FALSE---------------------------------------------------------------
#  # Either provide a LNToutput
#  duplicates_df <- lnt_similarity(LNToutput = LNToutput,
#                                  threshold = 0.97)

## ----results='hide', message=FALSE--------------------------------------------
# Or the important parts separately
duplicates_df <- lnt_similarity(
  texts = LNToutput@articles$Article,
  dates = LNToutput@meta$Date,
  IDs = LNToutput@articles$ID,
  threshold = 0.97
)


## ----eval=FALSE---------------------------------------------------------------
#  lnt_diff(duplicates_df, min = 0, max = Inf)

## -----------------------------------------------------------------------------
# Keep only pairs with a relative distance below 0.2. The trailing comma makes
# this an explicit row subset, so it behaves the same whether duplicates_df is
# a data.table or a plain data.frame (where a single index selects columns).
duplicates_df <- duplicates_df[duplicates_df$rel_dist < 0.2, ]
# Drop every entry whose ID is listed in the ID_duplicate column.
LNToutput <- LNToutput[!LNToutput@meta$ID %in% duplicates_df$ID_duplicate, ]

## -----------------------------------------------------------------------------
# Show the object subset to its first entry.
LNToutput[1, ]

## ----eval=FALSE---------------------------------------------------------------
#  #' generate new dataframes without highly similar duplicates
#  meta_df <- LNToutput@meta
#  articles_df <- LNToutput@articles
#  paragraphs_df <- LNToutput@paragraphs
#  
#  # Print e.g., meta to see how the data changed
#  head(meta_df, n = 3)

## ----echo=FALSE---------------------------------------------------------------
# Refresh the standalone copies of the slots from the current LNToutput and
# render the first three metadata rows as a markdown table.
paragraphs_df <- LNToutput@paragraphs
articles_df <- LNToutput@articles
meta_df <- LNToutput@meta
kable(head(meta_df, n = 3L), format = "markdown")

## -----------------------------------------------------------------------------
lnt_lookup(LNToutput, pattern = "statistical computing")

## -----------------------------------------------------------------------------
# Store the lookup result in the meta slot, then keep only the entries whose
# result is not NULL. vapply() always yields a logical vector (even for
# zero-length input), whereas sapply() would return an empty list there and
# make the negation fail.
LNToutput@meta$stats <- lnt_lookup(LNToutput, pattern = "statistical computing")
LNToutput <- LNToutput[!vapply(LNToutput@meta$stats, is.null, logical(1)), ]
LNToutput

## -----------------------------------------------------------------------------
# Same lookup, this time with a regular expression as the pattern.
lnt_lookup(LNToutput, pattern = "stat.*?")

## -----------------------------------------------------------------------------
# Flatten the lookup result into one vector and count each distinct match.
stat_matches <- lnt_lookup(LNToutput, pattern = "stat.+?\\b")
table(unlist(stat_matches))

## ----echo = FALSE-------------------------------------------------------------
# Clean up. "sample.TXT" in the working directory is still removed as before;
# in addition, remove the temporary copy created above, both under its
# original path (`temp`) and under the path it was renamed to (`newfile`).
# unlink() skips paths that do not exist, so listing both is harmless.
unlink(c("sample.TXT", temp, newfile))