#' Transform words into their base form
#'
#' @param text_vec A vector of text documents.
#' @param dict A dictionary used to find base forms: a table with \code{token}
#' and \code{lemma} columns. See [lexicon::hash_lemmas] for the required
#' structure.
#'
#' @return The vector with terms transformed into their base form, when
#' possible.
#'
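#' @examples
#' \dontrun{
#' # A minimal sketch, assuming the lexicon package is installed; with the
#' # default dictionary the output should be close to "The cat be run".
#' lemmatize("The cats were running")
#' }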
lemmatize <- function(text_vec, dict = lexicon::hash_lemmas) {
  dict <- setNames(dict$lemma, dict$token)
  separator <- "_tagseparator_"
  terms <- paste(text_vec, separator, collapse = " ")
  terms <- gsub(sprintf(" *%s$", separator), "", terms, perl = TRUE) %>%
    stringr::str_split("\\b") %>%
    unlist()
  terms <- terms[!(terms %in% c("", " "))]
  terms.lower <- tolower(terms)
  output <- ifelse(
    terms.lower %in% names(dict),
    dict[terms.lower], terms
  ) %>%
    paste(collapse = " ") %>%
    stringr::str_split(sprintf(" *%s *", separator)) %>%
    unlist()
  output <- gsub("\\s+", " ", output, perl = TRUE)
  output <- gsub("^\\s+|\\s+$", "", output, perl = TRUE)
  replace(output, output == "NA", NA)
}
#' Clean up text into tokens
#'
#' Removes stop words, punctuation, and contraction suffixes; lemmatizes the
#' text and converts it to lower case.
#'
#' @param corpus A vector of text documents.
#'
#' @return A cleaned-up vector of text documents.
#'
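#' @examples
#' \dontrun{
#' # A minimal sketch, assuming the tm and lexicon packages are installed.
#' # Stop words ("the", "were") are removed and the remaining terms are
#' # lemmatized, so the output should be close to "cat run".
#' tokenize_text("The cats were running")
#' }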
tokenize_text <- function(corpus) {
  message("- tokenizing text...")
  stopwords <- tm::stopwords("english")
  tictoc::tic()
  corpus <- tolower(corpus)
  corpus <- gsub("-", "_", corpus, fixed = TRUE) # protect hyphenated terms during stop-word removal
  corpus <- tm::removeWords(corpus, stopwords)
  corpus <- gsub("'(s|re|t|d)?\\b", "", corpus, perl = TRUE) # remove contraction suffixes
  corpus <- gsub("_", " ", corpus, fixed = TRUE) # split hyphenated terms back into words
  corpus <- gsub("[^\\w\\d\\s]+", " ", corpus, perl = TRUE) # remove punctuation
  corpus <- lemmatize(corpus)
  tictoc::toc()
  corpus
}
#' Tokenize authors in records
#'
#' Use different tokenization approaches based on author field format.
#'
#' @param authors A vector of author fields from an annotation data set.
#'
#' @return The tokenized author list.
#'
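#' @examples
#' \dontrun{
#' # A minimal sketch with hypothetical author strings: both PubMed/WOS style
#' # ("Last, First") and IEEE style ("F. Last") fields should be normalized
#' # to tokens like "Smith_JA Doe_J".
#' tokenize_authors(c("Smith, John A; Doe, Jane", "J. Smith; J. Doe"))
#' }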
tokenize_authors <- function(authors) {
  message("- tokenizing authors")
  tictoc::tic()
  with.comma <- stringr::str_detect(authors, ",")
  authors <- authors %>% stringr::str_squish()
  output <- parallel::mclapply(seq_along(authors), function(i) {
    if (is.na(with.comma[i])) { # no authors listed
      NA
    } else if (with.comma[i]) { # PubMed or WOS style author list
      authors[i] %>%
        stringr::str_remove_all("[^\\w ,;]") %>%
        stringr::str_replace_all("(?<=,)[ \\-\\w]+?(?:(?=;)|$)", function(x) { # reduce first names to initials
          paste0(stringr::str_extract_all(x, "\\b\\w")[[1]], collapse = "")
        }) %>%
        stringr::str_replace_all(",", "_") %>%
        stringr::str_remove_all(" +")
    } else { # IEEE style author list
      authors[i] %>%
        stringr::str_remove_all("[^\\w\\.;]") %>% # remove characters that are not letters, dots, or ";"
        stringr::str_replace_all("[^;]+(?:(?=;)|$)", function(x) { # extract the names between ";"
          stringr::str_replace(x, "([\\w \\.]+)\\.([\\w ]+)", "\\2_\\1") # use the rightmost dot to separate first and last names
        }) %>%
        stringr::str_remove_all("\\.")
    }
  }) %>%
    unlist() %>%
    stringr::str_replace_all("; *", " ")
  tictoc::toc()
  output
}
#' Tokenize keywords in records
#'
#' Clean up the keyword fields in the records.
#'
#' @param keywords A vector of keywords fields from an annotation data set.
#'
#' @return The tokenized keyword list.
#'
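#' @examples
#' \dontrun{
#' # A minimal sketch: multi-word keywords are joined by underscores and the
#' # ";" separators become spaces, e.g. "machine_learning systematic_review".
#' tokenize_keywords("Machine Learning; Systematic Review")
#' }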
tokenize_keywords <- function(keywords) {
  keywords %>%
    stringr::str_to_lower() %>%
    stringr::str_replace_all(c(
      "\\s*;\\s*" = ";",
      "[^;\\w]+" = "_",
      ";" = " "
    ))
}
#' Tokenize MeSH keywords in records
#'
#' Clean up the MeSH term fields in the records. Terms with subheadings (e.g.,
#' \code{Term/qualifier}) are expanded into the bare term plus a
#' term/subheading token.
#'
#' @param mesh A vector of MeSH keyword fields from an annotation data set.
#'
#' @return The tokenized MeSH keyword list.
#'
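#' @examples
#' \dontrun{
#' # A minimal sketch: a term with a subheading is expanded into the bare term
#' # plus a term/subheading token, e.g.
#' # "Neoplasms Neoplasms.sh.drug_therapy Humans".
#' tokenize_MESH("Neoplasms/drug therapy; Humans")
#' }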
tokenize_MESH <- function(mesh) {
  message("- tokenizing MeSH terms")
  tictoc::tic()
  output <- mesh %>%
    stringr::str_replace_all(c(" *; *" = ";", "[\\(\\)]" = "", "[ ,\\-]+" = "_", "&" = "and")) %>%
    stringr::str_replace_all("(?:(?<=;)|^)[^;]+/[^;]+(?:(?=;)|$)", function(x) { # expand subheadings
      x <- stringr::str_split(x, "/")[[1]]
      paste(c(x[1], paste(x[1], x[-1], sep = ".sh.")), collapse = ";")
    }) %>%
    stringr::str_replace_all(";", " ") %>%
    stringr::str_squish()
  tictoc::toc()
  output
}
#' Convert a vector of text documents into a Document Term Matrix
#'
#' A Document Term Matrix (DTM) is a structure describing the association of a
#' term to a document. In this case, a binary matrix is used, with one if a term
#' is present in a document and zero otherwise.
#'
#' Before computing the DTM, document terms are cleaned, tokenized and
#' lemmatized, and stop-words are removed.
#'
#' To reduce noise, only terms that appear in at least \code{min.freq} documents
#' are considered. The function also uses cosine similarity to identify
#' non-consecutive n-grams and to aggregate redundant terms.
#'
#' @param corpus A vector of text documents.
#' @param min.freq Minimum number of documents in which a term needs to appear
#' to be considered.
#' @param ids Identification IDs of the documents.
#' @param freq.subset.ids IDs to consider when computing term frequency.
#' @param included.pos Parts of speech (POS) to consider when building the DTM.
#' See [lexicon::hash_grady_pos] for a list of recognized POS.
#' @param tokenize.fun Function to use to clean up text.
#' @param add.ngrams Whether to search and add non-consecutive n-grams. See
#' [DTM.add_ngrams()].
#' @param n.gram.thresh The threshold to use to identify the network of
#' non-consecutive n-grams if \code{add.ngrams} is \code{TRUE}.
#' @param aggr.synonyms Whether to aggregate terms which almost always appear
#' together. See [DTM.aggr_synonyms()].
#' @param syn.thresh The threshold to use to identify the network of terms to
#' aggregate if \code{aggr.synonyms} is \code{TRUE}.
#' @param label A label to prepend to term columns in the DTM.
#' @param na.as.missing Whether to set to \code{NA} the DTM cells of empty
#' documents. If \code{FALSE}, those cells are set to zero.
#'
#' @return A Document Term Matrix with a row for each document and a column for
#' each term, plus a column with the document IDs.
#'
#' @export
#'
#' @examples
#' \dontrun{
#'
#' Records <- import_data(get_session_files("Session1")$Records)
#'
#' Title_DTM <- with(
#'   Records,
#'   text_to_DTM(Title,
#'     min.freq = 20, label = "TITLE__", ids = ID,
#'     freq.subset.ids = ID[Target %in% c("y", "n")]
#'   )
#' )
#' }
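#' \dontrun{
#' # A self-contained toy sketch (hypothetical corpus; min.freq is lowered
#' # because the corpus is tiny):
#' docs <- c(
#'   "deep learning for image classification",
#'   "image segmentation with deep networks"
#' )
#' text_to_DTM(docs, min.freq = 1, add.ngrams = FALSE, aggr.synonyms = FALSE)
#' }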
text_to_DTM <- function(corpus, min.freq = 20, ids = seq_along(corpus),
                        freq.subset.ids = ids,
                        included.pos = c("Noun", "Verb", "Adjective"), # TODO: add Plural and Noun Phrase
                        tokenize.fun = tokenize_text, add.ngrams = TRUE,
                        aggr.synonyms = TRUE, n.gram.thresh = .5,
                        syn.thresh = .9, label = "TERM__", na.as.missing = TRUE) {
  # Silence CMD check notes about non-standard evaluation
  pos <- word <- val <- term <- ID <- Freq <- NULL
  raw.corpus <- corpus
  order.ids <- seq_along(corpus)
  names(ids) <- order.ids
  if (is.na(min.freq)) stop('"min.freq" is NA.')
  if (!is.null(tokenize.fun)) {
    corpus <- tokenize.fun(corpus)
  }
  splitted.corpus <- corpus %>% stringr::str_split(" +")
  if (length(splitted.corpus) != length(ids)) stop("Number of documents and IDs are different.")
  excluded.pos <- lexicon::hash_grady_pos %>%
    mutate(pos = stringr::str_remove_all(pos, " \\(.*")) %>%
    filter(!(pos %in% included.pos) & !(word %in% word[pos %in% included.pos])) # keep terms that are ONLY associated with non-relevant parts of speech
  message("- to long format...")
  tictoc::tic()
  corpus <- data.frame(
    term = splitted.corpus %>% unlist(),
    val = 1,
    ID = rep(order.ids, splitted.corpus %>% sapply(length))
  ) %>%
    na.omit() %>%
    distinct() %>%
    mutate(
      val = replace(val, stringr::str_detect(term, "\\*"), 2),
      term = stringr::str_remove(term, "\\*")
    ) %>%
    filter(!(term %in% excluded.pos$word))
  tictoc::toc()
  message("- removing rare terms...")
  frequent_terms <- corpus %>%
    filter(ID %in% order.ids[ids %in% freq.subset.ids]) %>%
    count(term, name = "Freq") %>% # count term frequency, but only in the relevant IDs
    filter(Freq >= min.freq) %>%
    pull(term) # create the list of frequent terms
  corpus <- corpus %>%
    filter(term %in% frequent_terms) %>% # filter out infrequent terms
    arrange(ID, term, desc(val)) %>%
    distinct(ID, term, .keep_all = TRUE) # remove duplicated terms, keeping the first occurrence (useful for MeSH data)
  message("- to wide format...")
  tictoc::tic()
  DTM <- tidyr::pivot_wider(corpus,
    id_cols = ID, names_from = term,
    names_prefix = label, values_from = val,
    values_fill = 0
  )
  tictoc::toc()
  if (add.ngrams) {
    message("- finding non-consecutive n-grams...")
    tictoc::tic()
    DTM <- DTM.add_ngrams(DTM, min.sim = n.gram.thresh)
    tictoc::toc()
  }
  if (aggr.synonyms) {
    message("- finding synonyms...")
    tictoc::tic()
    DTM <- DTM.aggr_synonyms(DTM, min.sim = syn.thresh)
    tictoc::toc()
  }
  # The synonym aggregation procedure can create very long names
  DTM <- DTM %>% setNames(stringr::str_sub(colnames(DTM), 1, 10000))
  message("- managing missings...")
  if (nrow(DTM) < length(raw.corpus)) { # add documents with no content, i.e., NAs
    missing_docs <- setdiff(order.ids, DTM$ID)
    DTM <- bind_rows(
      DTM,
      DTM[rep(1, length(missing_docs)), ] %>% # add the missing documents using the first DTM row as template
        mutate(ID = missing_docs, across(c(-ID), ~0)) # set the term score to zero for all the added documents
    ) %>% arrange(ID)
  }
  if (na.as.missing) { # set terms to NA for documents with no content, otherwise leave them at zero
    DTM <- DTM %>% mutate(across(-ID, ~ replace(.x, is.na(raw.corpus), NA)))
  }
  DTM %>% mutate(ID = ids[ID])
}
#' Find non-consecutive n-grams
#'
#' Build a term-term network using a cosine similarity measure computed on the
#' term co-presence in documents. A threshold defined in \code{min.sim} is used
#' to identify edges. The maximal cliques of the network represent the
#' discovered n-grams.
#'
#' @param DTM A Document Term Matrix.
#' @param min.sim The minimal cosine similarity that identifies an edge.
#' @param max.terms The maximum size (i.e., the number of terms) of an n-gram.
#'
#' @return The same input Document Term Matrix with extra columns for the
#' n-grams.
#'
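#' @examples
#' \dontrun{
#' # A minimal sketch on a hypothetical DTM: "machine" and "learning" always
#' # co-occur, so a "TERM__machine._.learning" column should be added.
#' DTM <- data.frame(
#'   ID = 1:4,
#'   TERM__machine = c(1, 1, 0, 0),
#'   TERM__learning = c(1, 1, 0, 0),
#'   TERM__protein = c(0, 1, 1, 1)
#' )
#' DTM.add_ngrams(DTM, min.sim = .9)
#' }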
DTM.add_ngrams <- function(DTM, min.sim = .5, max.terms = 10) {
  mat <- as.matrix(DTM[, -1])
  mat.sparse <- Matrix::Matrix(mat, sparse = TRUE) # use sparse matrices
  TTM <- qlcMatrix::cosSparse(mat.sparse) # cosine similarity
  ngram.cliques <- igraph::graph_from_adjacency_matrix(as.matrix(TTM) >= min.sim, mode = "undirected", diag = FALSE) %>% # from TTM to undirected network
    igraph::max_cliques(min = 2) %>%
    lapply(names) # extract the cliques
  if (length(ngram.cliques) > 0) {
    new_cols <- lapply(ngram.cliques, function(clique) { # for each clique
      if (length(clique) <= max.terms) {
        new_col <- data.frame(
          as.numeric(DTM[, clique] %>% rowSums() == length(clique)) # TRUE if all the words are present in the document
        )
        colnames(new_col) <- paste0(stringr::str_extract(clique[1], "\\w+__"), stringr::str_remove(clique, "\\w+__") %>% paste(collapse = "._."))
        new_col
      }
    }) %>% bind_cols()
    DTM <- DTM %>% bind_cols(new_cols) # add the joined terms
  }
  DTM # return the DTM unchanged if no n-grams were found
}
#' Aggregate redundant terms
#'
#' Build a term-term network using a cosine similarity measure computed on the
#' term co-presence in documents. A high threshold defined in \code{min.sim} is
#' used to identify edges; it splits the network into multiple connected
#' components, each identifying a group of redundant terms.
#'
#' @param DTM A Document Term Matrix.
#' @param min.sim The minimal cosine similarity that identifies an edge.
#'
#' @return The same input Document Term Matrix with redundant terms removed and
#' joined into new columns.
#'
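#' @examples
#' \dontrun{
#' # A minimal sketch on a hypothetical DTM: "tumor" and "tumour" always
#' # appear together, so they should be merged into one "TERM__tumor.tumour"
#' # column.
#' DTM <- data.frame(
#'   ID = 1:4,
#'   TERM__tumor = c(1, 0, 1, 0),
#'   TERM__tumour = c(1, 0, 1, 0),
#'   TERM__screening = c(0, 1, 1, 0)
#' )
#' DTM.aggr_synonyms(DTM, min.sim = .9)
#' }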
DTM.aggr_synonyms <- function(DTM, min.sim = .9) {
  mat <- as.matrix(DTM[, -1])
  mat.sparse <- Matrix::Matrix(mat, sparse = TRUE) # use sparse matrices
  TTM <- qlcMatrix::cosSparse(mat.sparse) # cosine similarity
  syn.components <- igraph::graph_from_adjacency_matrix(as.matrix(TTM) >= min.sim, mode = "undirected", diag = FALSE) %>% # from TTM to undirected network
    igraph::components() # extract the connected subgraphs
  syn.components <- lapply(which(syn.components$csize > 1), function(i) {
    names(syn.components$membership)[syn.components$membership == i]
  })
  if (length(syn.components) > 0) {
    new_cols <- lapply(syn.components, function(component) { # for each component
      new_col <- data.frame(
        as.numeric(DTM[, component] %>% rowSums() > 0) # TRUE if at least one word is in the document
      )
      colnames(new_col) <- paste0(stringr::str_extract(component[1], "\\w+__"), stringr::str_remove(component, "\\w+__") %>% paste(collapse = "."))
      new_col
    }) %>% bind_cols()
    DTM <- DTM %>%
      select(-all_of(syn.components %>% unlist())) %>% # remove the single terms
      bind_cols(new_cols) # add the joined terms
  }
  DTM # return the DTM unchanged if no synonyms were found
}