Nothing
#' @title Aggregate a data.frame to the document/term level by calculating how many times a term occurs per document
#' @description Aggregate a data.frame to the document/term level by calculating how many times a term occurs per document
#' @param x a data.frame or data.table containing a field which can be considered
#' as a document (defaults to the first column in \code{x}) and a field which can be considered as a term
#' (defaults to the second column in \code{x}). If the dataset also contains a column called 'freq', this will be summed over instead of counting the number
#' of rows occur by document/term combination.\cr
#' If \code{x} is a character vector containing several terms, the text will be split by the argument \code{split}
#' before doing the agregation at the document/term level.
#' @param document If \code{x} is a data.frame, the column in \code{x} which identifies a document. If \code{x}
#' is a character vector then \code{document} is a vector of the same length as \code{x} where \code{document[i]} is the
#' document id which corresponds to the text in \code{x[i]}.
#' @param term If \code{x} is a data.frame, the column in \code{x} which identifies a term. Defaults to the second column
#' in \code{x}.
#' @param ... further arguments passed on to the methods
#' @return a data.table with columns doc_id, term, freq indicating how many times a term occurred in each document.
#' If freq occurred in the input dataset the resulting data will have summed the freq. If freq is not in the dataset,
#' will assume that freq is 1 for each row in the input dataset \code{x}.
#' @export
#' @examples
#' \dontshow{
#' data.table::setDTthreads(1)
#' }
#' ##
#' ## Calculate document_term_frequencies on a data.frame
#' ##
#' data(brussels_reviews_anno)
#' \dontshow{
#' brussels_reviews_anno <- subset(brussels_reviews_anno, language %in% "nl")
#' }
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "token")])
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
#' str(x)
#'
#' brussels_reviews_anno$my_doc_id <- paste(brussels_reviews_anno$doc_id,
#' brussels_reviews_anno$sentence_id)
#' x <- document_term_frequencies(brussels_reviews_anno[, c("my_doc_id", "lemma")])
#'
#' ##
#' ## Calculate document_term_frequencies on a character vector
#' ##
#' data(brussels_reviews)
#' x <- document_term_frequencies(x = brussels_reviews$feedback, document = brussels_reviews$id,
#' split = " ")
#' x <- document_term_frequencies(x = brussels_reviews$feedback, document = brussels_reviews$id,
#' split = "[[:space:][:punct:][:digit:]]+")
#'
#' ##
#' ## document-term-frequencies on several fields to easily include bigram and trigrams
#' ##
#' library(data.table)
#' x <- as.data.table(brussels_reviews_anno)
#' x <- x[, token_bigram := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
#' x <- x[, token_trigram := txt_nextgram(token, n = 3), by = list(doc_id, sentence_id)]
#' x <- document_term_frequencies(x = x,
#' document = "doc_id",
#' term = c("token", "token_bigram", "token_trigram"))
#' head(x)
document_term_frequencies <- function(x, document, ...){
UseMethod("document_term_frequencies")
}
#' @describeIn document_term_frequencies Create a data.frame with one row per document/term combination indicating the frequency of the term in the document
#' @export
document_term_frequencies.data.frame <- function(x, document = colnames(x)[1], term = colnames(x)[2], ...){
result <- list()
for(field in term){
result[[field]] <- dtf(x, document, term = field, ...)
}
result <- data.table::rbindlist(result)
result <- dtf(result)
result
}
dtf <- function(x, document = colnames(x)[1], term = colnames(x)[2], ...){
# r cmd check happiness
freq <- .N <- NULL
x <- setDT(x)
if("freq" %in% colnames(x)){
result <- x[, list(freq = as.integer(sum(freq))), by = c(document, term)]
}else{
result <- x[, list(freq = as.integer(.N)), by = c(document, term)]
}
if(!"term" %in% colnames(result)){
setnames(result, old = colnames(result)[2], new = c("term"))
}
if(!"doc_id" %in% colnames(result)){
setnames(result, old = colnames(result)[1], new = c("doc_id"))
}
result<- result[!is.na(term), ]
result
}
#' @describeIn document_term_frequencies Create a data.frame with one row per document/term combination indicating the frequency of the term in the document
#' @param split The regular expression to be used if \code{x} is a character vector.
#' This will split the character vector \code{x} in pieces by the provides split argument.
#' Defaults to splitting according to spaces/punctuations/digits.
#' @export
document_term_frequencies.character <- function(x, document=paste("doc", seq_along(x), sep=""), split = "[[:space:][:punct:][:digit:]]+", ...){
txt <- NULL
dt <- data.table(document = as.character(document), txt = x)
dt <- dt[!is.na(txt), list(term = unlist(strsplit(txt, split = split))), by = list(document)]
dt <- document_term_frequencies(dt)
dt
}
#' @title Add Term Frequency, Inverse Document Frequency and Okapi BM25 statistics to the output of document_term_frequencies
#' @description Term frequency Inverse Document Frequency (tfidf) is calculated as the multiplication of
#' \itemize{
#' \item Term Frequency (tf): how many times the word occurs in the document / how many words are in the document
#' \item Inverse Document Frequency (idf): log(number of documents / number of documents where the term appears)
#' }
#' The Okapi BM25 statistic is calculated as the multiplication of the inverse document frequency
#' and the weighted term frequency as defined at \url{https://en.wikipedia.org/wiki/Okapi_BM25}.
#' @param x a data.table as returned by \code{document_term_frequencies} containing the columns doc_id, term and freq.
#' @param k parameter k1 of the Okapi BM25 ranking function as defined at \url{https://en.wikipedia.org/wiki/Okapi_BM25}. Defaults to 1.2.
#' @param b parameter b of the Okapi BM25 ranking function as defined at \url{https://en.wikipedia.org/wiki/Okapi_BM25}. Defaults to 0.5.
#' @return a data.table with columns doc_id, term, freq and added to that the computed statistics
#' tf, idf, tfidf, tf_bm25 and bm25.
#' @export
#' @examples
#' data(brussels_reviews_anno)
#' \dontshow{
#' data.table::setDTthreads(1)
#' brussels_reviews_anno <- subset(brussels_reviews_anno, language %in% "nl")
#' }
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "token")])
#' x <- document_term_frequencies_statistics(x)
#' head(x)
document_term_frequencies_statistics <- function(x, k = 1.2, b = 0.75){
## r cmd check happiness
doc_id <- term <- freq <- tf <- idf <- tf_idf <- tf_bm25 <- bm25 <- NULL
##
## Term Frequency Inverse Document Frequency
## - tf: how many times the word occurs in the document / how many words are in the document
## - idf: number of documents / number of documents where the term appears
##
nr_docs <- length(unique(x$doc_id))
x <- x[, tf := freq / sum(freq), by = list(doc_id)]
x <- x[, idf := log(nr_docs / uniqueN(doc_id)), by = list(term)]
x <- x[, tf_idf := tf * idf, ]
##
## Okapi BM25
## - See https://en.wikipedia.org/wiki/Okapi_BM25
##
average_doc_length <- x[, list(nr_terms = sum(freq)), by = list(doc_id)]
average_doc_length <- mean(average_doc_length$nr_terms)
x <- x[, tf_bm25 := (freq * (k + 1)) / (freq + (k * (1 - b + b * (sum(freq) / average_doc_length)))), by = list(doc_id)]
x <- x[, bm25 := tf_bm25 * idf, ]
x
}
#' @title Create a document/term matrix
#' @description Create a document/term matrix from either
#' \itemize{
#' \item a data.frame with 1 row per document/term as returned by \code{\link{document_term_frequencies}}
#' \item a list of tokens from e.g. from package sentencepiece, tokenizers.bpe or just by using strsplit
#' \item an object of class DocumentTermMatrix or TermDocumentMatrix from the tm package
#' \item an object of class simple_triplet_matrix from the slam package
#' \item a regular dense matrix
#' }
#' @param x a data.frame with columns doc_id, term and freq indicating how many times a term occurred in that specific document. This is what \code{\link{document_term_frequencies}} returns.\cr
#' This data.frame will be reshaped to a matrix with 1 row per doc_id, the terms will be put
#' in the columns and the freq in the matrix cells. Note that the column name to use for freq can be set in the \code{weight} argument.
#' @param vocabulary a character vector of terms which should be present in the document term matrix even if they did not occur in \code{x}
#' @param weight a column of \code{x} indicating what to put in the matrix cells. Defaults to 'freq' indicating to use column \code{freq} from \code{x} to put into the matrix cells
#' @param ... further arguments currently not used
#' @return an sparse object of class dgCMatrix with in the rows the documents and in the columns the terms containing the frequencies
#' provided in \code{x} extended with terms which were not in \code{x} but were provided in \code{vocabulary}.
#' The rownames of this resulting object contain the doc_id from \code{x}
#' @export
#' @seealso \code{\link[Matrix]{sparseMatrix}}, \code{\link{document_term_frequencies}}
#' @examples
#' \dontshow{
#' data.table::setDTthreads(1)
#' }
#' x <- data.frame(doc_id = c(1, 1, 2, 3, 4),
#' term = c("A", "C", "Z", "X", "G"),
#' freq = c(1, 5, 7, 10, 0))
#' document_term_matrix(x)
#' document_term_matrix(x, vocabulary = LETTERS)
#'
#' ## Example on larger dataset
#' data(brussels_reviews_anno)
#' \dontshow{
#' brussels_reviews_anno <- subset(brussels_reviews_anno, language %in% "nl")
#' }
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
#' dtm <- document_term_matrix(x)
#' dim(dtm)
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
#' x <- document_term_frequencies_statistics(x)
#' dtm <- document_term_matrix(x)
#' dtm <- document_term_matrix(x, weight = "freq")
#' dtm <- document_term_matrix(x, weight = "tf_idf")
#' dtm <- document_term_matrix(x, weight = "bm25")
#' x <- split(brussels_reviews_anno$lemma, brussels_reviews_anno$doc_id)
#' dtm <- document_term_matrix(x)
#' ## example showing the vocubulary argument
#' ## allowing you to making sure terms which are not in the data are provided in the resulting dtm
#' allterms <- unique(x$term)
#' dtm <- document_term_matrix(head(x, 1000), vocabulary = allterms)
#'
#' ## example for a list of tokens
#' x <- list(doc1 = c("aa", "bb", "cc", "aa", "b"),
#' doc2 = c("bb", "bb", "dd", ""),
#' doc3 = character(),
#' doc4 = c("cc", NA),
#' doc5 = character())
#' document_term_matrix(x)
#' dtm <- document_term_matrix(x, vocabulary = c("a", "bb", "cc"))
#' dtm <- dtm_conform(dtm, rows = c("doc1", "doc2", "doc7"), columns = c("a", "bb", "cc"))
#' data(brussels_reviews)
#' x <- strsplit(setNames(brussels_reviews$feedback, brussels_reviews$id), split = " +")
#' x <- document_term_matrix(x)
#'
#'
#' ##
#' ## Example adding bigrams/trigrams to the document term matrix
#' ## Mark that this can also be done using ?dtm_cbind
#' ##
#' library(data.table)
#' x <- as.data.table(brussels_reviews_anno)
#' x <- x[, token_bigram := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
#' x <- x[, token_trigram := txt_nextgram(token, n = 3), by = list(doc_id, sentence_id)]
#' x <- document_term_frequencies(x = x,
#' document = "doc_id",
#' term = c("token", "token_bigram", "token_trigram"))
#' dtm <- document_term_matrix(x)
#'
#' ##
#' ## Convert dense matrix to sparse matrix
#' ##
#' x <- matrix(c(0, 0, 0, 1, NA, 3, 4, 5, 6, 7), nrow = 2)
#' x
#' dtm <- document_term_matrix(x)
#' dtm
#' x <- matrix(c(0, 0, 0, 0.1, NA, 0.3, 0.4, 0.5, 0.6, 0.7), nrow = 2)
#' x
#' dtm <- document_term_matrix(x)
#' dtm
#' x <- setNames(c(TRUE, NA, FALSE, FALSE), c("a", "b", "c", "d"))
#' x <- as.matrix(x)
#' dtm <- document_term_matrix(x)
#' dtm
#'
#' ##
#' ## Convert vectors to sparse matrices
#' ##
#' x <- setNames(-3:3, c("a", "b", "c", "d", "e", "f"))
#' dtm <- document_term_matrix(x)
#' dtm
#' x <- setNames(runif(6), c("a", "b", "c", "d", "e", "f"))
#' dtm <- document_term_matrix(x)
#' dtm
#'
#' ##
#' ## Convert lists to sparse matrices
#' ##
#' x <- list(a = c("some", "set", "of", "words"),
#' b1 = NA,
#' b2 = NA,
#' c1 = character(),
#' c2 = 0,
#' d = c("words", "words", "words"))
#' dtm <- document_term_matrix(x)
#' dtm
document_term_matrix <- function(x, vocabulary, weight = "freq", ...){
UseMethod("document_term_matrix")
}
#' @describeIn document_term_matrix Construct a document term matrix from a data.frame with columns doc_id, term, freq
#' @export
document_term_matrix.data.frame <- function(x, vocabulary, weight = "freq", ...){
stopifnot(all(c("doc_id", "term", weight) %in% colnames(x)))
#stopifnot(ncol(x) == 3)
x$document <- as.character(x$doc_id)
x$document <- factor(x$document, levels = setdiff(unique(x$document), NA))
term <- as.character(x$term)
if(!missing(vocabulary)){
x$term <- factor(term, levels = setdiff(unique(c(vocabulary, levels(factor(term)))), NA))
}else{
x$term <- factor(term)
}
doclabels <- levels(x$document)
termlabels <- levels(x$term)
x$document <- as.integer(x$document)
x$term <- as.integer(x$term)
dtm <- sparseMatrix(i=x$document, j = x$term, x = x[[weight]], dims = c(length(doclabels), length(termlabels)),
dimnames = list(doclabels, termlabels))
dtm
}
#' @describeIn document_term_matrix Construct a sparse document term matrix from a matrix
#' @export
document_term_matrix.matrix <- function(x, ...){
x <- as(x, "dgCMatrix")
#x <- as(as(as(x, "dMatrix"), "generalMatrix"), "CsparseMatrix")
#x <- Matrix(data = x, sparse = TRUE, doDiag = FALSE, dimnames = dimnames(x))
#if(is.logical(x)){
# x[] <- as.integer(x)
#}
#x <- as(x, "CsparseMatrix")
x
}
#' @describeIn document_term_matrix Construct a sparse document term matrix from an named integer vector
#' @export
document_term_matrix.integer <- function(x, ...){
if(anyDuplicated(names(x))){
stop("x has duplicate names")
}
x <- as.matrix(x)
x <- document_term_matrix.matrix(x)
x
}
#' @describeIn document_term_matrix Construct a sparse document term matrix from a named numeric vector
#' @export
document_term_matrix.numeric <- function(x, ...){
if(anyDuplicated(names(x))){
stop("x has duplicate names")
}
x <- as.matrix(x)
x <- document_term_matrix.matrix(x)
x
}
#' @describeIn document_term_matrix Construct a document term matrix from a list of tokens
#' @export
document_term_matrix.default <- function(x, vocabulary, ...){
stopifnot(is.list(x))
.N <- document <- term <- NULL
docs <- names(x)
if(is.null(docs)){
docs <- seq_along(x)
}else{
if(anyDuplicated(docs)){
stop("x has duplicate names")
}
}
x <- list(document = rep(x = docs, times = sapply(x, length)),
term = unlist(x, use.names = FALSE))
x$document <- factor(x$document, levels = docs)
if(!missing(vocabulary)){
x$term <- factor(x$term, levels = setdiff(unique(c(vocabulary, levels(factor(x$term)))), NA))
}else{
x$term <- factor(x$term)
}
x <- data.table::setDT(x)
x <- x[!is.na(term), list(freq = .N), by = list(document, term)]
doclabels <- levels(x$document)
termlabels <- levels(x$term)
x$document <- as.integer(x$document)
x$term <- as.integer(x$term)
dtm <- Matrix::sparseMatrix(i=x$document, j = x$term, x = x$freq,
dims = c(length(doclabels), length(termlabels)),
dimnames = list(doclabels, termlabels))
dtm
}
#' @describeIn document_term_matrix Convert an object of class \code{DocumentTermMatrix} from the tm package to a sparseMatrix
#' @export
document_term_matrix.DocumentTermMatrix <- function(x, ...){
dtm <- Matrix::sparseMatrix(i=x$i, j = x$j, x = x$v,
dims = c(x$nrow, x$ncol),
dimnames = x$dimnames)
dtm
}
#' @describeIn document_term_matrix Convert an object of class \code{TermDocumentMatrix} from the tm package to a sparseMatrix with
#' the documents in the rows and the terms in the columns
#' @export
document_term_matrix.TermDocumentMatrix <- function(x, ...){
dtm <- document_term_matrix.DocumentTermMatrix(x)
dtm <- Matrix::t(dtm)
dtm
}
#' @describeIn document_term_matrix Convert an object of class \code{simple_triplet_matrix} from the slam package to a sparseMatrix
#' @export
document_term_matrix.simple_triplet_matrix <- function(x, ...){
dtm <- Matrix::sparseMatrix(i=x$i, j = x$j, x = x$v,
dims = c(x$nrow, x$ncol),
dimnames = x$dimnames)
dtm
}
#' @title Inverse operation of the document_term_matrix function
#' @description Inverse operation of the \code{\link{document_term_matrix}} function.
#' Creates frequency table which contains 1 row per document/term
#' @param x an object as returned by \code{\link{document_term_matrix}}
#' @return a data.frame with columns doc_id, term and freq where freq is just the value in each
#' cell of the \code{x}
#' @export
#' @seealso \code{\link{document_term_matrix}}
#' @examples
#' x <- data.frame(
#' doc_id = c(1, 1, 2, 3, 4),
#' term = c("A", "C", "Z", "X", "G"),
#' freq = c(1, 5, 7, 10, 0))
#' dtm <- document_term_matrix(x)
#' dtm_reverse(dtm)
dtm_reverse <- function(x){
m <- Matrix::summary(x)
data.frame(doc_id = rownames(x)[m$i],
term = colnames(x)[m$j],
freq = m$x,
stringsAsFactors = FALSE)
}
#' @title Remove terms occurring with low frequency from a Document-Term-Matrix and documents with no terms
#' @description Remove terms occurring with low frequency from a Document-Term-Matrix and documents with no terms
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param minfreq integer with the minimum number of times the term should occur in order to keep the term
#' @param maxterms integer indicating the maximum number of terms which should be kept in the \code{dtm}. The argument is optional.
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix}
#' where terms with low occurrence are removed and documents without any terms are also removed
#' @export
#' @examples
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#'
#'
#' ## Remove terms with low frequencies and documents with no terms
#' x <- dtm_remove_lowfreq(dtm, minfreq = 10)
#' dim(x)
#' x <- dtm_remove_lowfreq(dtm, minfreq = 10, maxterms = 25)
#' dim(x)
#' x <- dtm_remove_lowfreq(dtm, minfreq = 10, maxterms = 25, remove_emptydocs = FALSE)
#' dim(x)
dtm_remove_lowfreq <- function(dtm, minfreq=5, maxterms, remove_emptydocs = TRUE){
dtm <- dtm[, which(Matrix::colSums(dtm) >= minfreq), drop = FALSE]
if(!missing(maxterms)){
terms <- head(sort(Matrix::colSums(dtm), decreasing = TRUE), n = maxterms)
dtm <- dtm[, which(colnames(dtm) %in% names(terms)), drop = FALSE]
}
if(remove_emptydocs){
dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]
}
dtm
}
#' @title Remove terms with high sparsity from a Document-Term-Matrix
#' @description Remove terms with high sparsity from a Document-Term-Matrix and remove documents with no terms.\cr
#' Sparsity indicates in how many documents the term is not occurring.
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param sparsity numeric in 0-1 range indicating the sparsity percent. Defaults to 0.99 meaning drop terms which occur in less than 1 percent of the documents.
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix}
#' where terms with high sparsity are removed and documents without any terms are also removed
#' @export
#' @examples
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#'
#'
#' ## Remove terms with low frequencies and documents with no terms
#' x <- dtm_remove_sparseterms(dtm, sparsity = 0.99)
#' dim(x)
#' x <- dtm_remove_sparseterms(dtm, sparsity = 0.99, remove_emptydocs = FALSE)
#' dim(x)
dtm_remove_sparseterms <- function(dtm, sparsity = 0.99, remove_emptydocs = TRUE){
colfreq <- diff(dtm@p)
sparseness <- 1 - (colfreq / nrow(dtm))
keep <- which(sparseness < sparsity)
if(length(keep) == 0){
warning(sprintf("No terms which occur more than %s percent of the documents", 100*(1-sparsity)))
if(remove_emptydocs){
dtm <- dtm[0, 0, drop = FALSE]
}else{
dtm <- dtm[, 0, drop = FALSE]
}
}else{
dtm <- dtm[, keep, drop = FALSE]
if(remove_emptydocs){
dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]
}
}
dtm
}
#' @title Term Frequency - Inverse Document Frequency calculation
#' @description Term Frequency - Inverse Document Frequency calculation.
#' Averaged by each term.
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @return a vector with tfidf values, one for each term in the \code{dtm} matrix
#' @export
#' @examples
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#'
#' ## Calculate tfidf
#' tfidf <- dtm_tfidf(dtm)
#' hist(tfidf, breaks = "scott")
#' head(sort(tfidf, decreasing = TRUE))
#' head(sort(tfidf, decreasing = FALSE))
dtm_tfidf <- function(dtm){
## number of times word appears / number of words in document, on average if non-missing
## times log2(# documents / #documentsXwords)
terms <- colnames(dtm)
m <- Matrix::summary(dtm)
term_tfidf <- tapply(m$x/Matrix::rowSums(dtm)[m$i], m$j, mean) *
log2(nrow(dtm)/Matrix::colSums(dtm > 0))
names(term_tfidf) <- terms
term_tfidf
}
#' @title Remove terms from a Document-Term-Matrix and documents with no terms based on the term frequency inverse document frequency
#' @description Remove terms from a Document-Term-Matrix and documents with no terms based on the term frequency inverse document frequency.
#' Either giving in the maximum number of terms (argument \code{top}), the tfidf cutoff (argument \code{cutoff})
#' or a quantile (argument \code{prob})
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param top integer with the number of terms which should be kept as defined by the highest mean tfidf
#' @param cutoff numeric cutoff value to keep only terms in \code{dtm} where the tfidf obtained by \code{dtm_tfidf} is higher than this value
#' @param prob numeric quantile indicating to keep only terms in \code{dtm} where the tfidf obtained by \code{dtm_tfidf} is higher than
#' the \code{prob} percent quantile
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix}
#' where terms with high tfidf are kept and documents without any remaining terms are removed
#' @export
#' @examples
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' dtm <- dtm_remove_lowfreq(dtm, minfreq = 10)
#' dim(dtm)
#'
#' ## Keep only terms with high tfidf
#' x <- dtm_remove_tfidf(dtm, top=50)
#' dim(x)
#' x <- dtm_remove_tfidf(dtm, top=50, remove_emptydocs = FALSE)
#' dim(x)
#'
#' ## Keep only terms with tfidf above 1.1
#' x <- dtm_remove_tfidf(dtm, cutoff=1.1)
#' dim(x)
#'
#' ## Keep only terms with tfidf above the 60 percent quantile
#' x <- dtm_remove_tfidf(dtm, prob=0.6)
#' dim(x)
dtm_remove_tfidf <- function(dtm, top, cutoff, prob, remove_emptydocs = TRUE){
tfidf <- dtm_tfidf(dtm)
if(!missing(top)){
terms <- head(sort(tfidf, decreasing = TRUE), n = top)
terms <- names(terms)
}else if(!missing(cutoff)){
terms <- tfidf[tfidf >= cutoff]
terms <- names(terms)
}else if(!missing(prob)){
cutoff <- stats::quantile(tfidf, prob)
terms <- tfidf[tfidf >= cutoff]
terms <- names(terms)
}else{
stop("either provide top, cutoff or prob")
}
if(length(terms) == 0){
stop("no terms found in reducing based on tfidf, consider increasing top or decreasing cutoff/prob")
}
dtm <- dtm[, which(colnames(dtm) %in% terms), drop = FALSE]
if(remove_emptydocs){
dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]
}
dtm
}
#' @title Remove terms from a Document-Term-Matrix and keep only documents which have a least some terms
#' @description Remove terms from a Document-Term-Matrix and keep only documents which have a least some terms
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param terms a character vector of terms which are in \code{colnames(dtm)} and which should be removed
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix}
#' where the indicated terms are removed as well as documents with no terms whatsoever
#' @export
#' @examples
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' dim(dtm)
#' x <- dtm_remove_terms(dtm, terms = c("appartement", "casa", "centrum", "ciudad"))
#' dim(x)
#' x <- dtm_remove_terms(dtm, terms = c("appartement", "casa", "centrum", "ciudad"),
#' remove_emptydocs = FALSE)
#' dim(x)
dtm_remove_terms <- function(dtm, terms, remove_emptydocs = TRUE){
dtm <- dtm[, which(!colnames(dtm) %in% terms), drop = FALSE]
if(remove_emptydocs){
dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]
}
dtm
}
#' @title Pearson Correlation for Sparse Matrices
#' @description Pearson Correlation for Sparse Matrices.
#' More memory and time-efficient than \code{cor(as.matrix(x))}.
#' @param x A matrix, potentially a sparse matrix such as a "dgCMatrix" object
#' which is returned by \code{\link{document_term_matrix}}
#' @return a correlation matrix
#' @seealso \code{\link{document_term_matrix}}
#' @export
#' @examples
#' x <- data.frame(
#' doc_id = c(1, 1, 2, 3, 4),
#' term = c("A", "C", "Z", "X", "G"),
#' freq = c(1, 5, 7, 10, 0))
#' dtm <- document_term_matrix(x)
#' dtm_cor(dtm)
dtm_cor <- function(x) {
n <- nrow(x)
covmat <- (as.matrix(Matrix::crossprod(x)) - n * Matrix::tcrossprod(Matrix::colMeans(x))) / (n - 1)
cormat <- covmat / Matrix::tcrossprod(sqrt(Matrix::diag(covmat)))
cormat
}
#' @title Combine 2 document term matrices either by rows or by columns
#' @description These 2 methods provide \code{\link{cbind}} and \code{\link{rbind}} functionality
#' for sparse matrix objects which are returned by \code{\link{document_term_matrix}}. \cr
#'
#' In case of \code{dtm_cbind}, if the rows are not ordered in the same way in x and y, it will order them based on the rownames.
#' If there are missing rows these will be filled with NA values. \cr
#' In case of \code{dtm_rbind}, if the columns are not ordered in the same way in x and y, it will order them based on the colnames.
#' If there are missing columns these will be filled with NA values.
#' @param x a sparse matrix such as a "dgCMatrix" object which is returned by \code{\link{document_term_matrix}}
#' @param y a sparse matrix such as a "dgCMatrix" object which is returned by \code{\link{document_term_matrix}}
#' @param ... more sparse matrices
#' @return a sparse matrix where either rows are put below each other in case of \code{dtm_rbind}
#' or columns are put next to each other in case of \code{dtm_cbind}
#' @seealso \code{\link{document_term_matrix}}
#' @name dtm_bind
#' @aliases dtm_rbind dtm_cbind
#' @export
#' @examples
#' \dontshow{
#' data.table::setDTthreads(1)
#' }
#' data(brussels_reviews_anno)
#' x <- brussels_reviews_anno
#'
#' ## rbind
#' dtm1 <- document_term_frequencies(x = subset(x, doc_id %in% c("10049756", "10284782")),
#' document = "doc_id", term = "token")
#' dtm1 <- document_term_matrix(dtm1)
#' dtm2 <- document_term_frequencies(x = subset(x, doc_id %in% c("10789408", "12285061", "35509091")),
#' document = "doc_id", term = "token")
#' dtm2 <- document_term_matrix(dtm2)
#' dtm3 <- document_term_frequencies(x = subset(x, doc_id %in% c("31133394", "36224131")),
#' document = "doc_id", term = "token")
#' dtm3 <- document_term_matrix(dtm3)
#' m <- dtm_rbind(dtm1, dtm2)
#' dim(m)
#' m <- dtm_rbind(dtm1, dtm2, dtm3)
#' dim(m)
#'
#' ## cbind
#' library(data.table)
#' x <- subset(brussels_reviews_anno, language %in% c("nl", "fr"))
#' x <- as.data.table(x)
#' x <- x[, token_bigram := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
#' x <- x[, lemma_upos := sprintf("%s//%s", lemma, upos)]
#' dtm1 <- document_term_frequencies(x = x, document = "doc_id", term = c("token"))
#' dtm1 <- document_term_matrix(dtm1)
#' dtm2 <- document_term_frequencies(x = x, document = "doc_id", term = c("token_bigram"))
#' dtm2 <- document_term_matrix(dtm2)
#' dtm3 <- document_term_frequencies(x = x, document = "doc_id", term = c("upos"))
#' dtm3 <- document_term_matrix(dtm3)
#' dtm4 <- document_term_frequencies(x = x, document = "doc_id", term = c("lemma_upos"))
#' dtm4 <- document_term_matrix(dtm4)
#' m <- dtm_cbind(dtm1, dtm2)
#' dim(m)
#' m <- dtm_cbind(dtm1, dtm2, dtm3, dtm4)
#' dim(m)
#' m <- dtm_cbind(dtm1[-c(100, 999), ], dtm2[-1000,])
#' dim(m)
dtm_cbind <- function(x, y, ...){
ldots <- list(...)
if(is.null(rownames(x))) stop("x needs to have rownames")
if(is.null(rownames(y))) stop("y needs to have rownames")
if(length(intersect(colnames(x), colnames(y))) > 0) stop("x and y should not have overlapping column names")
rowsleft <- rownames(x)
rowsright <- rownames(y)
r <- union(rowsleft, rowsright)
addleft <- setdiff(rowsright, rowsleft)
if(length(addleft) > 0){
addleft <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
dims = c(length(addleft), ncol(x)), dimnames = list(addleft, colnames(x)))
x <- methods::rbind2(x, addleft)
}
addright <- setdiff(rowsleft, rowsright)
if(length(addright) > 0){
addright <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
dims = c(length(addright), ncol(y)), dimnames = list(addright, colnames(y)))
y <- methods::rbind2(y, addright)
}
out <- cbind2(x[r, , drop = FALSE], y[r, , drop = FALSE])
if(length(ldots) > 0){
largs <- ldots[-1]
largs$x <- out
largs$y <- ldots[[1]]
out <- do.call(dtm_cbind, largs)
}
out
}
#' @export
#' @rdname dtm_bind
dtm_rbind <- function(x, y, ...){
ldots <- list(...)
if(is.null(colnames(x))) stop("x needs to have colnames")
if(is.null(colnames(y))) stop("y needs to have colnames")
if(length(intersect(rownames(x), rownames(y))) > 0) stop("x and y should not have overlapping row names")
colsleft <- colnames(x)
colsright <- colnames(y)
r <- union(colsleft, colsright)
addleft <- setdiff(colsright, colsleft)
if(length(addleft) > 0){
addleft <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
dims = c(nrow(x), length(addleft)), dimnames = list(rownames(x), addleft))
x <- methods::cbind2(x, addleft)
}
addright <- setdiff(colsleft, colsright)
if(length(addright) > 0){
addright <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
dims = c(nrow(y), length(addright)), dimnames = list(rownames(y), addright))
y <- methods::cbind2(y, addright)
}
out <- rbind2(x[, r, drop = FALSE], y[, r, drop = FALSE])
if(length(ldots) > 0){
largs <- ldots[-1]
largs$x <- out
largs$y <- ldots[[1]]
out <- do.call(dtm_rbind, largs)
}
out
}
#' @title Column sums and Row sums for document term matrices
#' @description Column sums and Row sums for document term matrices
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param groups optionally, a list with column/row names or column/row indexes of the \code{dtm} which should be combined by
#' taking the sum over the rows or columns of these. See the examples
#' @return
#' Returns either a vector in case argument \code{groups} is not provided or a sparse matrix of class \code{dgCMatrix}
#' in case argument \code{groups} is provided
#' \itemize{
#' \item{in case \code{groups} is not provided: a vector of row/column sums with corresponding names}
#' \item{in case \code{groups} is provided: a sparse matrix containing summed information over the groups of rows/columns}
#' }
#' @export
#' @aliases dtm_colsums dtm_rowsums
#' @examples
#' x <- data.frame(
#' doc_id = c(1, 1, 2, 3, 4),
#' term = c("A", "C", "Z", "X", "G"),
#' freq = c(1, 5, 7, 10, 0))
#' dtm <- document_term_matrix(x)
#' x <- dtm_colsums(dtm)
#' x
#' x <- dtm_rowsums(dtm)
#' head(x)
#'
#' ##
#' ## Grouped column summation
#' ##
#' x <- list(doc1 = c("aa", "bb", "aa", "b"), doc2 = c("bb", "bb", "BB"))
#' dtm <- document_term_matrix(x)
#' dtm
#' dtm_colsums(dtm, groups = list(combinedB = c("b", "bb"), combinedA = c("aa", "A")))
#' dtm_colsums(dtm, groups = list(combinedA = c("aa", "A")))
#' dtm_colsums(dtm, groups = list(
#' combinedB = grep(pattern = "b", colnames(dtm), ignore.case = TRUE, value = TRUE),
#' combinedA = c("aa", "A", "ZZZ"),
#' test = character()))
#' dtm_colsums(dtm, groups = list())
#'
#' ##
#' ## Grouped row summation
#' ##
#' x <- list(doc1 = c("aa", "bb", "aa", "b"),
#' doc2 = c("bb", "bb", "BB"),
#' doc3 = c("bb", "bb", "BB"),
#' doc4 = c("bb", "bb", "BB", "b"))
#' dtm <- document_term_matrix(x)
#' dtm
#' dtm_rowsums(dtm, groups = list(doc1 = "doc1", combi = c("doc2", "doc3", "doc4")))
#' dtm_rowsums(dtm, groups = list(unknown = "docUnknown", combi = c("doc2", "doc3", "doc4")))
#' dtm_rowsums(dtm, groups = list())
dtm_colsums <- function(dtm, groups){
if(missing(groups)){
Matrix::colSums(dtm)
}else{
dtm_colsums_group(dtm, groups)
}
}
#' @export
#' @rdname dtm_colsums
dtm_rowsums <- function(dtm, groups){
if(missing(groups)){
Matrix::rowSums(dtm)
}else{
dtm_rowsums_group(dtm, groups)
}
}
dtm_colsums_group <- function(x, groups){
stopifnot(is.list(groups))
if(length(groups) == 0){
extra <- matrix(nrow = nrow(x), ncol = 0, byrow = FALSE, dimnames = list(rownames(x)))
extra <- document_term_matrix.matrix(extra, "dgCMatrix")
return(extra)
}
#stopifnot(length(groups) > 0)
#stopifnot(all(sapply(groups, is.character)))
if(is.null(names(groups))){
stop("groups should have names")
}
x_fields <- colnames(x)
groups <- lapply(groups, FUN=function(x){
if(is.character(x)){
x
}else if(is.integer(x) | is.numeric(x)){
x_fields[x]
}else{
stop("groups should contain a list of column names")
}
})
extra <- mapply(fields = groups, column_j = seq_along(groups), FUN=function(fields, column_j){
fields <- which(x_fields %in% fields)
if(length(fields) == 0){
list(i = integer(), j = integer(), x = numeric())
}else{
values <- dtm_rowsums(x[, fields, drop = FALSE])
idx <- which(values != 0)
list(i = idx, j = rep(column_j, length(idx)), x = values[idx])
}
}, SIMPLIFY = FALSE)
extra <- data.table::rbindlist(extra)
extra <- Matrix::sparseMatrix(dims = c(nrow(x), length(groups)),
dimnames = list(rownames(x), names(groups)),
i = extra$i, j = extra$j, x = extra$x)
extra
}
dtm_rowsums_group <- function(x, groups){
stopifnot(is.list(groups))
if(length(groups) == 0){
extra <- matrix(nrow = 0, ncol = ncol(x), byrow = FALSE, dimnames = list(character(), colnames(x)))
extra <- document_term_matrix.matrix(extra, "dgCMatrix")
return(extra)
}
#stopifnot(length(groups) > 0)
#stopifnot(all(sapply(groups, is.character)))
if(is.null(names(groups))){
stop("groups should have names")
}
x_rows <- rownames(x)
groups <- lapply(groups, FUN=function(x){
if(is.character(x)){
x
}else if(is.integer(x) | is.numeric(x)){
x_rows[x]
}else{
stop("groups should contain a list of row names")
}
})
extra <- mapply(rows = groups, row_i = seq_along(groups), FUN=function(rows, row_i){
rows <- which(x_rows %in% rows)
if(length(rows) == 0){
list(i = integer(), j = integer(), x = numeric())
}else{
values <- dtm_colsums(x[rows, , drop = FALSE])
idx <- which(values != 0)
list(i = rep(row_i, length(idx)), j = idx, x = values[idx])
}
}, SIMPLIFY = FALSE)
extra <- data.table::rbindlist(extra)
extra <- Matrix::sparseMatrix(dims = c(length(groups), ncol(x)),
dimnames = list(names(groups), colnames(x)),
i = extra$i, j = extra$j, x = extra$x)
extra
}
#' @title Compare term usage across 2 document groups using the Chi-square Test for Count Data
#' @description Perform a \code{\link{chisq.test}} to compare if groups of documents have more prevalence of specific terms.\cr
#' The function looks to each term in the document term matrix and applies a \code{\link{chisq.test}} comparing the frequency
#' of occurrence of each term compared to the other terms in the document group.
#' @param dtm a document term matrix: an object returned by \code{\link{document_term_matrix}}
#' @param groups a logical vector with 2 groups (TRUE / FALSE) where the size of the \code{groups} vector
#' is the same as the number of rows of \code{dtm} and where element i corresponds row i of \code{dtm}
#' @param correct passed on to \code{\link{chisq.test}}
#' @param ... further arguments passed on to \code{\link{chisq.test}}
#' @export
#' @return a data.frame with columns term, chisq, p.value, freq, freq_true, freq_false indicating for each term in the \code{dtm},
#' how frequently it occurs in each group, the Chi-Square value and it's corresponding p-value.
#' @examples
#' data(brussels_reviews_anno)
#' ##
#' ## Which nouns occur in text containing the term 'centre'
#' ##
#' x <- subset(brussels_reviews_anno, xpos == "NN" & language == "fr")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' relevant <- dtm_chisq(dtm, groups = dtm[, "centre"] > 0)
#' head(relevant, 10)
#'
#' ##
#' ## Which adjectives occur in text containing the term 'hote'
#' ##
#' x <- subset(brussels_reviews_anno, xpos == "JJ" & language == "fr")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#'
#' group <- subset(brussels_reviews_anno, lemma %in% "hote")
#' group <- rownames(dtm) %in% group$doc_id
#' relevant <- dtm_chisq(dtm, groups = group)
#' head(relevant, 10)
#'
#'
#' \dontrun{
#' # do not show scientific notation of the p-values
#' options(scipen = 100)
#' head(relevant, 10)
#' }
dtm_chisq <- function(dtm, groups, correct = TRUE, ...){
stopifnot(is.logical(groups))
stopifnot(length(unique(groups)) == 2)
stopifnot(length(groups) == nrow(dtm))
recode <- function(x, from, to){
to[match(x, from)]
}
DTM <- dtm_reverse(dtm)
DTM <- data.frame(doc_id = recode(DTM$doc_id, from = rownames(dtm), to = groups),
term = DTM$term,
freq = DTM$freq, stringsAsFactors = FALSE)
DTM <- document_term_frequencies(DTM)
DTM <- document_term_matrix(DTM)
freq_true <- DTM["TRUE" , , drop = TRUE]
freq_false <- DTM["FALSE", , drop = TRUE]
contingencies <- data.frame(term = colnames(DTM),
term_freq_true = freq_true,
term_freq_false = freq_false,
otherterms_freq_true = sum(freq_true) - freq_true,
otherterms_freq_flase = sum(freq_false) - freq_false,
stringsAsFactors = FALSE)
result <- mapply(term = contingencies$term,
a = contingencies$term_freq_true, b = contingencies$term_freq_false,
c = contingencies$otherterms_freq_true, d = contingencies$otherterms_freq_flase,
FUN = function(term, a, b, c, d){
tab <- matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = TRUE)
suppressWarnings(result <- chisq.test(tab, correct = correct, ...))
list(term = term,
chisq = result$statistic,
p.value = result$p.value,
freq = a + b,
freq_true = a,
freq_false = b)
}, SIMPLIFY = FALSE)
result <- data.table::rbindlist(result)
result <- result[order(result$chisq, decreasing = TRUE), ]
result <- data.table::setDF(result)
result
}
#' @title Make sure a document term matrix has exactly the specified rows and columns
#' @description Makes sure the document term matrix has exactly the rows and columns which you specify. If missing rows or columns
#' are occurring, the function fills these up either with empty cells or with the value that you provide. See the examples.
#' @param dtm a document term matrix: an object returned by \code{\link{document_term_matrix}}
#' @param rows a character vector of row names which \code{dtm} should have
#' @param columns a character vector of column names which \code{dtm} should have
#' @param fill a value to use to fill up missing rows / columns. Defaults to using an empty cell.
#' @return the sparse matrix \code{dtm} with exactly the specified rows and columns
#' @export
#' @seealso \code{\link{document_term_matrix}}
#' @examples
#' x <- data.frame(doc_id = c("doc_1", "doc_1", "doc_1", "doc_2"),
#' text = c("a", "a", "b", "c"),
#' stringsAsFactors = FALSE)
#' dtm <- document_term_frequencies(x)
#' dtm <- document_term_matrix(dtm)
#' dtm
#' dtm_conform(dtm,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(dtm,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"),
#' fill = 1)
#' dtm_conform(dtm, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(dtm, columns = c("a", "b", "Z"))
#' dtm_conform(dtm, rows = c("doc_1"))
#' dtm_conform(dtm, rows = character())
#' dtm_conform(dtm, columns = character())
#' dtm_conform(dtm, rows = character(), columns = character())
#'
#' ##
#' ## Some examples on border line cases
#' ##
#' special1 <- dtm[, character()]
#' special2 <- dtm[character(), character()]
#' special3 <- dtm[character(), ]
#'
#' dtm_conform(special1,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special1,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"),
#' fill = 1)
#' dtm_conform(special1, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special1, columns = c("a", "b", "Z"))
#' dtm_conform(special1, rows = c("doc_1"))
#' dtm_conform(special1, rows = character())
#' dtm_conform(special1, columns = character())
#' dtm_conform(special1, rows = character(), columns = character())
#'
#' dtm_conform(special2,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special2,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"),
#' fill = 1)
#' dtm_conform(special2, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special2, columns = c("a", "b", "Z"))
#' dtm_conform(special2, rows = c("doc_1"))
#' dtm_conform(special2, rows = character())
#' dtm_conform(special2, columns = character())
#' dtm_conform(special2, rows = character(), columns = character())
#'
#' dtm_conform(special3,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special3,
#' rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"),
#' fill = 1)
#' dtm_conform(special3, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special3, columns = c("a", "b", "Z"))
#' dtm_conform(special3, rows = c("doc_1"))
#' dtm_conform(special3, rows = character())
#' dtm_conform(special3, columns = character())
#' dtm_conform(special3, rows = character(), columns = character())
dtm_conform <- function(dtm, rows, columns, fill){
if(!missing(columns)){
missing_terms <- setdiff(columns, colnames(dtm))
if(length(missing_terms)){
if(!missing(fill)){
extra <- expand.grid(i = seq_len(nrow(dtm)), j = length(missing_terms))
extra <- Matrix::sparseMatrix(dims = c(nrow(dtm), length(missing_terms)), dimnames = list(rownames(dtm), missing_terms), i = extra$i, j = extra$j, x = fill)
}else{
extra <- Matrix::sparseMatrix(dims = c(nrow(dtm), length(missing_terms)), dimnames = list(rownames(dtm), missing_terms), i = {}, j = {})
}
if(nrow(dtm) == 0){
dtm <- methods::cbind2(dtm, extra)
}else{
dtm <- dtm_cbind(dtm, extra)
}
}
}
if(!missing(rows)){
missing_docs <- setdiff(rows, rownames(dtm))
if(length(missing_docs) > 0){
if(!missing(fill)){
extra <- expand.grid(i = seq_len(length(missing_docs)), j = ncol(dtm))
extra <- Matrix::sparseMatrix(dims = c(length(missing_docs), ncol(dtm)), dimnames = list(missing_docs, colnames(dtm)), i = extra$i, j = extra$j, x = fill)
}else{
extra <- Matrix::sparseMatrix(dims = c(length(missing_docs), ncol(dtm)), dimnames = list(missing_docs, colnames(dtm)), i = {}, j = {})
}
if(ncol(dtm) == 0){
dtm <- methods::rbind2(dtm, extra)
}else{
dtm <- udpipe::dtm_rbind(dtm, extra)
}
}
}
if(!missing(rows) & !missing(columns)){
dtm <- dtm[rows, columns, drop = FALSE]
}else if(!missing(rows)){
dtm <- dtm[rows, , drop = FALSE]
}else if(!missing(columns)){
dtm <- dtm[, columns, drop = FALSE]
}
dtm
}
#' @title Reorder a Document-Term-Matrix alongside a vector or data.frame
#' @description This utility function is useful to align a Document-Term-Matrix with
#' information in a data.frame or a vector to predict, such that both the predictive information as well as the target
#' is available in the same order. \cr
#' Matching is done based on the identifiers in the rownames of \code{x} and either the names of the \code{y} vector
#' or the first column of \code{y} in case it is a data.frame.
#' @param x a Document-Term-Matrix of class dgCMatrix (which can be an object returned by \code{\link{document_term_matrix}})
#' @param y either a vector or data.frame containing something to align with \code{x} (e.g. for predictive purposes).
#' \itemize{
#' \item{In case \code{y} is a vector, it should have names which are available in the rownames of \code{x}.}
#' \item{In case \code{y} is a data.frame, it's first column should contain identifiers which are available in the rownames of \code{x}.}
#' }
#' @param FUN a function to be applied on \code{x} before aligning it to \code{y}. See the examples
#' @param ... further arguments passed on to FUN
#' @return a list with elements \code{x} and \code{y} containing the document term matrix \code{x} in the same order as \code{y}.
#' \itemize{
#' \item{If in \code{y} a vector was passed, the returned \code{y} element will be a vector}
#' \item{If in \code{y} a data.frame was passed with more than 2 columns, the returned \code{y} element will be a data.frame}
#' \item{If in \code{y} a data.frame was passed with exactly 2 columns, the returned \code{y} element will be a vector}
#' }
#' Only returns data of \code{x} with overlapping identifiers in \code{y}.
#' @export
#' @seealso \code{\link{document_term_matrix}}
#' @examples
#' x <- matrix(1:9, nrow = 3, dimnames = list(c("a", "b", "c")))
#' x
#' dtm_align(x = x,
#' y = c(b = 1, a = 2, c = 6, d = 6))
#' dtm_align(x = x,
#' y = c(b = 1, a = 2, c = 6, d = 6, d = 7, a = -1))
#'
#' data(brussels_reviews)
#' data(brussels_listings)
#' x <- brussels_reviews
#' x <- strsplit.data.frame(x, term = "feedback", group = "listing_id")
#' x <- document_term_frequencies(x)
#' x <- document_term_matrix(x)
#' y <- brussels_listings$price
#' names(y) <- brussels_listings$listing_id
#'
#' ## align a matrix of predictors with a vector to predict
#' trainset <- dtm_align(x = x, y = y)
#' trainset <- dtm_align(x = x, y = y, FUN = function(dtm){
#' dtm <- dtm_remove_lowfreq(dtm, minfreq = 5)
#' dtm <- dtm_sample(dtm)
#' dtm
#' })
#' head(names(y))
#' head(rownames(x))
#' head(names(trainset$y))
#' head(rownames(trainset$x))
#'
#' ## align a matrix of predictors with a data.frame
#' trainset <- dtm_align(x = x, y = brussels_listings[, c("listing_id", "price")])
#' trainset <- dtm_align(x = x,
#' y = brussels_listings[, c("listing_id", "price", "room_type")])
#' head(trainset$y$listing_id)
#' head(rownames(trainset$x))
#'
#' ## example with duplicate data in case of data balancing
#' dtm_align(x = matrix(1:30, nrow = 3, dimnames = list(c("a", "b", "c"))),
#' y = c(a = 1, a = 2, b = 3, d = 6, b = 6))
#' target <- subset(brussels_listings, listing_id %in% brussels_reviews$listing_id)
#' target <- rbind(target[1:3, ], target[c(2, 3), ], target[c(1, 4), ])
#' trainset <- dtm_align(x = x, y = target[, c("listing_id", "price")])
#' trainset <- dtm_align(x = x, y = setNames(target$price, target$listing_id))
#' names(trainset$y)
#' rownames(trainset$x)
dtm_align <- function(x, y, FUN, ...){
if(!inherits(x, c("dgCMatrix", "matrix"))){
warning(sprintf("expecting x to be of class dgCMatrix, while you passed a %s", paste(class(x), collapse = " ")))
}
if(!missing(FUN)){
x <- FUN(x, ...)
}
if(is.data.frame(y)){
if(ncol(y) < 2){
stop("y is a data.frame, it should have at least 2 columns, where the first is a document id")
}
nm <- y[[1]]
#y <- y[[2]]
#names(y) <- nm
}else if(is.vector(y) | is.factor(y)){
nm <- names(y)
if(is.null(nm)){
stop("y is required to be a vector which has names otherwise we can not align it with the rownames of x")
}
}else{
stop("dtm_match is only implemented for y of type data.frame or with vectors")
}
X <- x[which(rownames(x) %in% nm), , drop = FALSE]
#idx <- match(rownames(X), nm)
idx <- which(nm %in% rownames(X))
if(NCOL(y) == 1){
Y <- y[idx]
}else if(NCOL(y) == 2){
Y <- y[[2]]
names(Y) <- nm
Y <- Y[idx]
}else{
Y <- y[idx, , drop = FALSE]
}
X <- X[match(nm[idx], rownames(X)), , drop = FALSE]
structure(list(x = X, y = Y), class = "dtm_aligned")
}
#' @title Random samples and permutations from a Document-Term-Matrix
#' @description Sample the specified number of rows from the Document-Term-Matrix using either with or without replacement.
#' @param dtm a document term matrix of class dgCMatrix (which can be an object returned by \code{\link{document_term_matrix}})
#' @param size a positive number, the number of rows to sample
#' @param replace should sampling be with replacement
#' @param prob a vector of probability weights, one for each row of \code{x}
#' @export
#' @return \code{dtm} with as many rows as specified in \code{size}
#' @examples
#' x <- list(doc1 = c("aa", "bb", "cc", "aa", "b"),
#' doc2 = c("bb", "bb", "dd", ""),
#' doc3 = character(),
#' doc4 = c("cc", NA),
#' doc5 = character())
#' dtm <- document_term_matrix(x)
#' dtm_sample(dtm, size = 2)
#' dtm_sample(dtm, size = 3)
#' dtm_sample(dtm, size = 2)
#' dtm_sample(dtm, size = 8, replace = TRUE)
#' dtm_sample(dtm, size = 8, replace = TRUE, prob = c(1, 1, 0.01, 0.5, 0.01))
dtm_sample <- function(dtm, size = nrow(dtm), replace = FALSE, prob = NULL){
idx <- sample.int(n = nrow(dtm), size = size, replace = replace, prob = prob)
dtm[idx, ]
}
#' @title Semantic Similarity to a Singular Value Decomposition
#' @description Calculate the similarity of a document term matrix to a set of terms based on
#' a Singular Value Decomposition (SVD) embedding matrix.\cr
#' This can be used to easily construct a sentiment score based on the latent scale defined by a set of positive or negative terms.
#' @param dtm a sparse matrix such as a "dgCMatrix" object which is returned by \code{\link{document_term_matrix}} containing frequencies of terms for each document
#' @param embedding a matrix containing the \code{v} element from an singular value decomposition with the right singular vectors.
#' The rownames of that matrix should contain terms which are available in the \code{colnames(dtm)}. See the examples.
#' @param weights a numeric vector with weights giving your definition of which terms are positive or negative,
#' The names of this vector should be terms available in the rownames of the embedding matrix. See the examples.
#' @param terminology a character vector of terms to limit the calculation of the similarity for the \code{dtm} to the linear combination of the weights.
#' Defaults to all terms from the \code{embedding} matrix.
#' @param type either 'cosine' or 'dot' indicating to respectively calculate cosine similarities or inner product similarities between the \code{dtm} and the SVD embedding space. Defaults to 'cosine'.
#' @export
#' @return an object of class 'svd_similarity' which is a list with elements
#' \itemize{
#' \item weights: The weights used. These are scaled to sum up to 1 as well on the positive as the negative side
#' \item type: The type of similarity calculated (either 'cosine' or 'dot')
#' \item terminology: A data.frame with columns term, freq and similarity where similarity indicates
#' the similarity between the term and the SVD embedding space of the weights and freq is how frequently the term occurs in the \code{dtm}.
#' This dataset is sorted in descending order by similarity.
#' \item similarity: A data.frame with columns doc_id and similarity indicating the similarity between
#' the \code{dtm} and the SVD embedding space of the weights. The doc_id is the identifier taken from the rownames of \code{dtm}.
#' \item scale: A list with elements terminology and weights
#' indicating respectively the similarity in the SVD embedding space
#' between the \code{terminology} and each of the weights and between the weight terms itself
#' }
#' @seealso \url{https://en.wikipedia.org/wiki/Latent_semantic_analysis}
#' @examples
#' data("brussels_reviews_anno", package = "udpipe")
#' x <- subset(brussels_reviews_anno, language %in% "nl" & (upos %in% "ADJ" | lemma %in% "niet"))
#' dtm <- document_term_frequencies(x, document = "doc_id", term = "lemma")
#' dtm <- document_term_matrix(dtm)
#' dtm <- dtm_remove_lowfreq(dtm, minfreq = 3)
#'
#' ## Function performing Singular Value Decomposition on sparse/dense data
#' dtm_svd <- function(dtm, dim = 5, type = c("RSpectra", "svd"), ...){
#' type <- match.arg(type)
#' if(type == "svd"){
#' SVD <- svd(dtm, nu = 0, nv = dim, ...)
#' }else if(type == "RSpectra"){
#' #Uncomment this if you want to use the faster sparse SVD by RSpectra
#' #SVD <- RSpectra::svds(dtm, nu = 0, k = dim, ...)
#' }
#' rownames(SVD$v) <- colnames(dtm)
#' SVD$v
#' }
#' #embedding <- dtm_svd(dtm, dim = 5)
#' embedding <- dtm_svd(dtm, dim = 5, type = "svd")
#'
#' ## Define positive / negative terms and calculate the similarity to these
#' weights <- setNames(c(1, 1, 1, 1, -1, -1, -1, -1),
#' c("fantastisch", "schoon", "vriendelijk", "net",
#' "lawaaiig", "lastig", "niet", "slecht"))
#' scores <- dtm_svd_similarity(dtm, embedding = embedding, weights = weights)
#' scores
#' str(scores$similarity)
#' hist(scores$similarity$similarity)
#'
#' plot(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' type = "n")
#' text(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' labels = scores$terminology$term)
#'
#' \dontrun{
#' ## More elaborate example using word2vec
#' ## building word2vec model on all Dutch texts,
#' ## finding similarity of dtm to adjectives only
#' set.seed(123)
#' library(word2vec)
#' text <- subset(brussels_reviews_anno, language == "nl")
#' text <- paste.data.frame(text, term = "lemma", group = "doc_id")
#' text <- text$lemma
#' model <- word2vec(text, dim = 10, iter = 20, type = "cbow", min_count = 1)
#' predict(model, newdata = names(weights), type = "nearest", top_n = 3)
#' embedding <- as.matrix(model)
#' }
#' data(brussels_reviews_w2v_embeddings_lemma_nl)
#' embedding <- brussels_reviews_w2v_embeddings_lemma_nl
#' adjective <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% "ADJ")
#' adjective <- txt_freq(adjective$lemma)
#' adjective <- subset(adjective, freq >= 5 & nchar(key) > 1)
#' adjective <- adjective$key
#'
#' scores <- dtm_svd_similarity(dtm, embedding, weights = weights, type = "dot",
#' terminology = adjective)
#' scores
#' plot(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' type = "n")
#' text(scores$terminology$similarity_weight, log(scores$terminology$freq),
#' labels = scores$terminology$term, cex = 0.8)
dtm_svd_similarity <- function(dtm, embedding, weights, terminology = rownames(embedding), type = c("cosine", "dot")){
doc_id <- term <- prop <- in_terminology <- NULL
embedding_similarity <- function(x, y, type = c("cosine", "dot")) {
if(!is.matrix(x)){
x <- matrix(x, nrow = 1)
}
if(!is.matrix(y)){
y <- matrix(y, nrow = 1)
}
type <- match.arg(type)
if(type == "dot"){
similarities <- tcrossprod(x, y)
}else if(type == "cosine"){
normx <- sqrt(rowSums(x^2))
normy <- sqrt(rowSums(y^2))
similarities <- tcrossprod(x, y) / (normx %o% normy)
}
similarities
}
setNames <- function(x, labels){
names(x) <- labels
x
}
##
## DATA CHECKS
##
type <- match.arg(type)
stopifnot(is.matrix(embedding))
stopifnot(is.vector(weights))
object <- list(dim = ncol(embedding),
terminology = rownames(embedding),
embedding = as.matrix(embedding))
if(missing(terminology)){
terminology <- object$terminology
}else{
missing_embeddings <- setdiff(terminology, object$terminology)
if(length(missing_embeddings) > 0){
warning(sprintf("Removing '%s' from terminology as these are not part of rownames(embedding)", paste(missing_embeddings, collapse = ", ")))
}
terminology <- intersect(terminology, object$terminology)
}
not_known_weights <- setdiff(names(weights), object$terminology)
if(length(not_known_weights) > 0){
warning(sprintf("Removing '%s' from weights as these are not part of rownames(embedding)", paste(not_known_weights, collapse = ", ")))
weights <- weights[!names(weights) %in% not_known_weights]
}
stopifnot(length(weights) > 0)
if(length(unique(sign(weights))) != 2){
stop("weights should contain positive as well as negative values")
}
## rescale weights to make sure positive values sum to 1 and negative also sum to -1
weights_scaled <- data.frame(term = names(weights),
weight = weights,
sign = sign(weights),
stringsAsFactors = FALSE)
weights_scaled <- lapply(X = split(weights_scaled, weights_scaled$sign),
FUN = function(x){
x$weight_scaled = x$weight / abs(sum(x$weight))
x
})
weights_scaled <- do.call(rbind, weights_scaled)
weights_scaled <- setNames(weights_scaled$weight_scaled, weights_scaled$term)
#weights_scaled <- setNames(weights_scaled$weight / nrow(weights_scaled), weights_scaled$term)
####################################################################################
## Calculate embedding similarity of terminology to weights, weights-to-weights
## and the embedding space of the linear combination of the weights
##
weightsterms <- names(weights_scaled)
similarity_terminology_to_weights <- embedding_similarity(
object$embedding[terminology, , drop = FALSE],
object$embedding[weightsterms, , drop = FALSE],
type = type)
similarity_weights_to_weights <- embedding_similarity(
object$embedding[weightsterms, , drop = FALSE],
object$embedding[weightsterms, , drop = FALSE],
type = type)
## weighted term-to-weights similarity reflecting a scale on which -1 to +1 range lies
weightspace <- similarity_terminology_to_weights %*% weights_scaled
weightspace <- weightspace[, 1]
##
## Predict (for terms part of the terminology, get the weights embedding space)
##
freq <- dtm_colsums(dtm)
dtf <- dtm_reverse(dtm)
dtf <- data.table::setDT(dtf)
dtf <- dtf[, in_terminology := term %in% terminology, ]
#dtf <- dtf[, prop := as.numeric(freq / sum(freq)), by = list(doc_id)]
dtf <- dtf[, prop := as.numeric(ifelse(any(in_terminology), freq[in_terminology] / sum(freq[in_terminology]), 0)), by = list(doc_id)]
dtm <- document_term_matrix(dtf, weight = "prop", vocabulary = terminology)
dtm <- dtm[, terminology, drop = FALSE]
scores <- dtm %*% weightspace
scores <- scores[, 1]
scores <- data.frame(doc_id = names(scores),
similarity = as.numeric(scores),
stringsAsFactors = FALSE)
terminology_similarity <- sort(weightspace, decreasing = TRUE)
terminology_similarity <- data.frame(
term = names(terminology_similarity),
freq = txt_recode(names(terminology_similarity), from = names(freq), to = as.integer(freq), na.rm = TRUE),
similarity_weight = as.numeric(terminology_similarity),
stringsAsFactors = FALSE)
terminology_similarity$freq <- ifelse(is.na(terminology_similarity$freq), 0, terminology_similarity$freq)
result <- list(weights = weights_scaled,
type = type,
terminology = terminology_similarity,
similarity = scores,
scale = list(
terminology = similarity_terminology_to_weights,
weights = similarity_weights_to_weights))
class(result) <- "svd_similarity"
result
}
#' @export
print.svd_similarity <- function(x, n = 7, digits = 2, ...){
cat(sprintf("Latent Semantic Scaling using %s similarity on SVD matrix", x$type), sep = "\n")
cat(sprintf("Weights: %s", paste(sprintf("%s %s", names(x$weights), round(x$weights, 4)), collapse = ", ")), sep = "\n")
cat(sprintf("Top %s most similar terms on the high end of the scale", n), sep = "\n")
elements <- head(x$terminology, n = n, sep = "\n")
mapply(term = elements$term, freq = elements$freq, similarity = elements$similarity, FUN=function(term, freq, similarity){
cat(sprintf(" - %s %s (frequency: %s)", term, round(similarity, digits = digits), freq), sep = "\n")
})
cat(sprintf("Top %s most similar terms on the low end of the scale", n), sep = "\n")
elements <- tail(x$terminology, n = n, sep = "\n")
elements <- elements[order(elements$similarity, decreasing = FALSE), ]
mapply(term = elements$term, freq = elements$freq, similarity = elements$similarity, FUN=function(term, freq, similarity){
cat(sprintf(" - %s %s (frequency: %s)", term, round(similarity, digits = digits), freq), sep = "\n")
})
invisible()
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.