R/nlp_flow.R

Defines functions print.svd_similarity dtm_svd_similarity dtm_sample dtm_align dtm_conform dtm_chisq dtm_rowsums_group dtm_colsums_group dtm_rowsums dtm_colsums dtm_rbind dtm_cbind dtm_cor dtm_remove_terms dtm_remove_tfidf dtm_tfidf dtm_remove_sparseterms dtm_remove_lowfreq dtm_reverse document_term_matrix.simple_triplet_matrix document_term_matrix.TermDocumentMatrix document_term_matrix.DocumentTermMatrix document_term_matrix.default document_term_matrix.numeric document_term_matrix.integer document_term_matrix.matrix document_term_matrix.data.frame document_term_matrix document_term_frequencies_statistics document_term_frequencies.character dtf document_term_frequencies.data.frame document_term_frequencies

Documented in document_term_frequencies document_term_frequencies.character document_term_frequencies.data.frame document_term_frequencies_statistics document_term_matrix document_term_matrix.data.frame document_term_matrix.default document_term_matrix.DocumentTermMatrix document_term_matrix.integer document_term_matrix.matrix document_term_matrix.numeric document_term_matrix.simple_triplet_matrix document_term_matrix.TermDocumentMatrix dtm_align dtm_cbind dtm_chisq dtm_colsums dtm_conform dtm_cor dtm_rbind dtm_remove_lowfreq dtm_remove_sparseterms dtm_remove_terms dtm_remove_tfidf dtm_reverse dtm_rowsums dtm_sample dtm_svd_similarity dtm_tfidf

#' @title Aggregate a data.frame to the document/term level by calculating how many times a term occurs per document
#' @description Aggregate a data.frame to the document/term level by calculating how many times a term occurs per document
#' @param x a data.frame or data.table containing a field which can be considered 
#' as a document (defaults to the first column in \code{x}) and a field which can be considered as a term 
#' (defaults to the second column in \code{x}). If the dataset also contains a column called 'freq', this will be summed over instead of counting the number 
#' of rows occur by document/term combination.\cr
#' If \code{x} is a character vector containing several terms, the text will be split by the argument \code{split}
#' before doing the agregation at the document/term level.
#' @param document If \code{x} is a data.frame, the column in \code{x} which identifies a document. If \code{x}
#' is a character vector then \code{document} is a vector of the same length as \code{x} where \code{document[i]} is the
#' document id which corresponds to the text in \code{x[i]}.
#' @param term If \code{x} is a data.frame, the column in \code{x} which identifies a term. Defaults to the second column
#' in \code{x}.
#' @param ... further arguments passed on to the methods
#' @return a data.table with columns doc_id, term, freq indicating how many times a term occurred in each document.
#' If freq occurred in the input dataset the resulting data will have summed the freq. If freq is not in the dataset,
#' will assume that freq is 1 for each row in the input dataset \code{x}.
#' @export
#' @examples 
#' \dontshow{
#' data.table::setDTthreads(1)
#' }
#' ##
#' ## Calculate document_term_frequencies on a data.frame
#' ##
#' data(brussels_reviews_anno)
#' \dontshow{
#' brussels_reviews_anno <- subset(brussels_reviews_anno, language %in% "nl")
#' }
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "token")])
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
#' str(x)
#' 
#' brussels_reviews_anno$my_doc_id <- paste(brussels_reviews_anno$doc_id, 
#'                                          brussels_reviews_anno$sentence_id)
#' x <- document_term_frequencies(brussels_reviews_anno[, c("my_doc_id", "lemma")])
#' 
#' ##
#' ## Calculate document_term_frequencies on a character vector
#' ##
#' data(brussels_reviews)
#' x <- document_term_frequencies(x = brussels_reviews$feedback, document = brussels_reviews$id, 
#'                                split = " ")
#' x <- document_term_frequencies(x = brussels_reviews$feedback, document = brussels_reviews$id, 
#'                                split = "[[:space:][:punct:][:digit:]]+")
#'                                
#' ##
#' ## document-term-frequencies on several fields to easily include bigram and trigrams
#' ##
#' library(data.table)
#' x <- as.data.table(brussels_reviews_anno)
#' x <- x[, token_bigram  := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
#' x <- x[, token_trigram := txt_nextgram(token, n = 3), by = list(doc_id, sentence_id)]
#' x <- document_term_frequencies(x = x, 
#'                                document = "doc_id", 
#'                                term = c("token", "token_bigram", "token_trigram"))
#' head(x)
document_term_frequencies <- function(x, document, ...){
  UseMethod("document_term_frequencies")
}

#' @describeIn document_term_frequencies Create a data.frame with one row per document/term combination indicating the frequency of the term in the document
#' @export
document_term_frequencies.data.frame <- function(x, document = colnames(x)[1], term = colnames(x)[2], ...){
  result <- list()
  for(field in term){
    result[[field]] <- dtf(x, document, term = field, ...)
  }
  result <- data.table::rbindlist(result)
  result <- dtf(result)
  result
}

dtf <- function(x, document = colnames(x)[1], term = colnames(x)[2], ...){
  # r cmd check happiness
  freq <- .N <- NULL
  
  x <- setDT(x)
  if("freq" %in% colnames(x)){
    result <- x[, list(freq = as.integer(sum(freq))), by = c(document, term)]  
  }else{
    result <- x[, list(freq = as.integer(.N)), by = c(document, term)]  
  }
  if(!"term" %in% colnames(result)){
    setnames(result, old = colnames(result)[2], new = c("term"))  
  }
  if(!"doc_id" %in% colnames(result)){
    setnames(result, old = colnames(result)[1], new = c("doc_id"))  
  }
  result<- result[!is.na(term), ]
  result
}


#' @describeIn document_term_frequencies Create a data.frame with one row per document/term combination indicating the frequency of the term in the document
#' @param split The regular expression to be used if \code{x} is a character vector. 
#' This will split the character vector \code{x} in pieces by the provides split argument. 
#' Defaults to splitting according to spaces/punctuations/digits.
#' @export
document_term_frequencies.character <- function(x, document=paste("doc", seq_along(x), sep=""), split = "[[:space:][:punct:][:digit:]]+", ...){
  txt <- NULL
  dt <- data.table(document = as.character(document), txt = x)
  dt <- dt[!is.na(txt), list(term = unlist(strsplit(txt, split = split))), by = list(document)]
  dt <- document_term_frequencies(dt)
  dt
}


#' @title Add Term Frequency, Inverse Document Frequency and Okapi BM25 statistics to the output of document_term_frequencies
#' @description Term frequency Inverse Document Frequency (tfidf) is calculated as the multiplication of
#' \itemize{
#' \item Term Frequency (tf): how many times the word occurs in the document / how many words are in the document
#' \item Inverse Document Frequency (idf): log(number of documents / number of documents where the term appears)
#' }
#' The Okapi BM25 statistic is calculated as the multiplication of the inverse document frequency
#' and the weighted term frequency as defined at \url{https://en.wikipedia.org/wiki/Okapi_BM25}.
#' @param x a data.table as returned by \code{document_term_frequencies} containing the columns doc_id, term and freq.
#' @param k parameter k1 of the Okapi BM25 ranking function as defined at \url{https://en.wikipedia.org/wiki/Okapi_BM25}. Defaults to 1.2.
#' @param b parameter b of the Okapi BM25 ranking function as defined at \url{https://en.wikipedia.org/wiki/Okapi_BM25}. Defaults to 0.5.
#' @return a data.table with columns doc_id, term, freq and added to that the computed statistics
#' tf, idf, tfidf, tf_bm25 and bm25.
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' \dontshow{
#' data.table::setDTthreads(1)
#' brussels_reviews_anno <- subset(brussels_reviews_anno, language %in% "nl")
#' }
#' x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "token")])
#' x <- document_term_frequencies_statistics(x)
#' head(x)
document_term_frequencies_statistics <- function(x, k = 1.2, b = 0.75){
  ## r cmd check happiness
  doc_id <- term <- freq <- tf <- idf <- tf_idf <- tf_bm25 <- bm25 <- NULL

  ##
  ## Term Frequency Inverse Document Frequency
  ##   - tf: how many times the word occurs in the document / how many words are in the document
  ##   - idf: number of documents / number of documents where the term appears
  ##
  nr_docs <- length(unique(x$doc_id))
  x <- x[, tf := freq / sum(freq), by = list(doc_id)]
  x <- x[, idf := log(nr_docs / uniqueN(doc_id)), by = list(term)]
  x <- x[, tf_idf := tf * idf, ]
  
  ##
  ## Okapi BM25
  ##   - See https://en.wikipedia.org/wiki/Okapi_BM25
  ##
  average_doc_length <- x[, list(nr_terms = sum(freq)), by = list(doc_id)]
  average_doc_length <- mean(average_doc_length$nr_terms)
  x <- x[, tf_bm25 := (freq * (k + 1)) / (freq + (k * (1 - b + b * (sum(freq) / average_doc_length)))), by = list(doc_id)]
  x <- x[, bm25 := tf_bm25 * idf, ]
  x
}



#' @title Create a document/term matrix 
#' @description Create a document/term matrix from either
#' \itemize{
#' \item a data.frame with 1 row per document/term as returned by \code{\link{document_term_frequencies}}
#' \item a list of tokens from e.g. from package sentencepiece, tokenizers.bpe or just by using strsplit
#' \item an object of class DocumentTermMatrix or TermDocumentMatrix from the tm package
#' \item an object of class simple_triplet_matrix from the slam package
#' \item a regular dense matrix
#' }
#' @param x a data.frame with columns doc_id, term and freq indicating how many times a term occurred in that specific document. This is what \code{\link{document_term_frequencies}} returns.\cr
#' This data.frame will be reshaped to a matrix with 1 row per doc_id, the terms will be put 
#' in the columns and the freq in the matrix cells. Note that the column name to use for freq can be set in the \code{weight} argument.
#' @param vocabulary a character vector of terms which should be present in the document term matrix even if they did not occur in \code{x}
#' @param weight a column of \code{x} indicating what to put in the matrix cells. Defaults to 'freq' indicating to use column \code{freq} from \code{x} to put into the matrix cells
#' @param ... further arguments currently not used
#' @return an sparse object of class dgCMatrix with in the rows the documents and in the columns the terms containing the frequencies
#' provided in \code{x} extended with terms which were not in \code{x} but were provided in \code{vocabulary}.
#' The rownames of this resulting object contain the doc_id from \code{x}
#' @export
#' @seealso \code{\link[Matrix]{sparseMatrix}}, \code{\link{document_term_frequencies}}
#' @examples 
#' \dontshow{
#' data.table::setDTthreads(1)
#' }
#' x <- data.frame(doc_id = c(1, 1, 2, 3, 4), 
#'  term = c("A", "C", "Z", "X", "G"), 
#'  freq = c(1, 5, 7, 10, 0))
#' document_term_matrix(x)
#' document_term_matrix(x, vocabulary = LETTERS)
#' 
#' ## Example on larger dataset
#' data(brussels_reviews_anno)
#' \dontshow{
#' brussels_reviews_anno <- subset(brussels_reviews_anno, language %in% "nl")
#' }
#' x   <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
#' dtm <- document_term_matrix(x)
#' dim(dtm)
#' x   <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
#' x   <- document_term_frequencies_statistics(x)
#' dtm <- document_term_matrix(x)
#' dtm <- document_term_matrix(x, weight = "freq")
#' dtm <- document_term_matrix(x, weight = "tf_idf")
#' dtm <- document_term_matrix(x, weight = "bm25")
#' x   <- split(brussels_reviews_anno$lemma, brussels_reviews_anno$doc_id)
#' dtm <- document_term_matrix(x)
#' ## example showing the vocubulary argument
#' ## allowing you to making sure terms which are not in the data are provided in the resulting dtm
#' allterms <- unique(x$term)
#' dtm <- document_term_matrix(head(x, 1000), vocabulary = allterms)
#' 
#' ## example for a list of tokens
#' x <- list(doc1 = c("aa", "bb", "cc", "aa", "b"), 
#'           doc2 = c("bb", "bb", "dd", ""), 
#'           doc3 = character(),
#'           doc4 = c("cc", NA), 
#'           doc5 = character())
#' document_term_matrix(x)
#' dtm <- document_term_matrix(x, vocabulary = c("a", "bb", "cc"))
#' dtm <- dtm_conform(dtm, rows = c("doc1", "doc2", "doc7"), columns = c("a", "bb", "cc"))
#' data(brussels_reviews)
#' x   <- strsplit(setNames(brussels_reviews$feedback, brussels_reviews$id), split = " +")
#' x   <- document_term_matrix(x)
#' 
#' 
#' ##
#' ## Example adding bigrams/trigrams to the document term matrix
#' ## Mark that this can also be done using ?dtm_cbind
#' ##
#' library(data.table)
#' x <- as.data.table(brussels_reviews_anno)
#' x <- x[, token_bigram  := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
#' x <- x[, token_trigram := txt_nextgram(token, n = 3), by = list(doc_id, sentence_id)]
#' x <- document_term_frequencies(x = x, 
#'                                document = "doc_id", 
#'                                term = c("token", "token_bigram", "token_trigram"))
#' dtm <- document_term_matrix(x)
#' 
#' ##
#' ## Convert dense matrix to sparse matrix
#' ##
#' x <- matrix(c(0, 0, 0, 1, NA, 3, 4, 5, 6, 7), nrow = 2)
#' x
#' dtm <- document_term_matrix(x)
#' dtm
#' x <- matrix(c(0, 0, 0, 0.1, NA, 0.3, 0.4, 0.5, 0.6, 0.7), nrow = 2)
#' x
#' dtm <- document_term_matrix(x)
#' dtm
#' x   <- setNames(c(TRUE, NA, FALSE, FALSE), c("a", "b", "c", "d"))
#' x   <- as.matrix(x)
#' dtm <- document_term_matrix(x)
#' dtm
#' 
#' ##
#' ## Convert vectors to sparse matrices
#' ##
#' x   <- setNames(-3:3, c("a", "b", "c", "d", "e", "f"))
#' dtm <- document_term_matrix(x)
#' dtm
#' x   <- setNames(runif(6), c("a", "b", "c", "d", "e", "f"))
#' dtm <- document_term_matrix(x)
#' dtm
#' 
#' ##
#' ## Convert lists to sparse matrices
#' ##
#' x   <- list(a = c("some", "set", "of", "words"), 
#'             b1 = NA,
#'             b2 = NA,  
#'             c1 = character(),
#'             c2 = 0,  
#'             d = c("words", "words", "words"))
#' dtm <- document_term_matrix(x)
#' dtm
document_term_matrix <- function(x, vocabulary, weight = "freq", ...){
  UseMethod("document_term_matrix")
}

#' @describeIn document_term_matrix Construct a document term matrix from a data.frame with columns doc_id, term, freq
#' @export
document_term_matrix.data.frame <- function(x, vocabulary, weight = "freq", ...){
  stopifnot(all(c("doc_id", "term", weight) %in% colnames(x)))
  #stopifnot(ncol(x) == 3)
  
  x$document <- as.character(x$doc_id)
  x$document <- factor(x$document, levels = setdiff(unique(x$document), NA))
  
  term <- as.character(x$term)
  if(!missing(vocabulary)){
    x$term <- factor(term, levels = setdiff(unique(c(vocabulary, levels(factor(term)))), NA))
  }else{
    x$term <- factor(term)
  }
  
  doclabels <- levels(x$document)
  termlabels <- levels(x$term)
  
  x$document <- as.integer(x$document)
  x$term <- as.integer(x$term)
  
  dtm <- sparseMatrix(i=x$document, j = x$term, x = x[[weight]], dims = c(length(doclabels), length(termlabels)),
                      dimnames = list(doclabels, termlabels))
  dtm
}


#' @describeIn document_term_matrix Construct a sparse document term matrix from a matrix
#' @export
document_term_matrix.matrix <- function(x, ...){
  x <- as(x, "dgCMatrix")
  #x <- as(as(as(x, "dMatrix"), "generalMatrix"), "CsparseMatrix")
  #x <- Matrix(data = x, sparse = TRUE, doDiag = FALSE, dimnames = dimnames(x))
  #if(is.logical(x)){
  #  x[] <- as.integer(x)
  #}
  #x <- as(x, "CsparseMatrix")
  x
}

#' @describeIn document_term_matrix Construct a sparse document term matrix from an named integer vector
#' @export
document_term_matrix.integer <- function(x, ...){
  if(anyDuplicated(names(x))){
    stop("x has duplicate names")
  }
  x <- as.matrix(x)
  x <- document_term_matrix.matrix(x)
  x
}

#' @describeIn document_term_matrix Construct a sparse document term matrix from a named numeric vector
#' @export
document_term_matrix.numeric <- function(x, ...){
  if(anyDuplicated(names(x))){
    stop("x has duplicate names")
  }
  x <- as.matrix(x)
  x <- document_term_matrix.matrix(x)
  x
}

#' @describeIn document_term_matrix Construct a document term matrix from a list of tokens
#' @export
document_term_matrix.default <- function(x, vocabulary, ...){  
  stopifnot(is.list(x))
  .N <- document <- term <- NULL
  docs <- names(x)
  if(is.null(docs)){
    docs <- seq_along(x)
  }else{
    if(anyDuplicated(docs)){
      stop("x has duplicate names")
    }
  }
  x <- list(document = rep(x = docs, times = sapply(x, length)),
            term     = unlist(x, use.names = FALSE))
  x$document <- factor(x$document, levels = docs)
  if(!missing(vocabulary)){
    x$term <- factor(x$term, levels = setdiff(unique(c(vocabulary, levels(factor(x$term)))), NA))
  }else{
    x$term <- factor(x$term)
  }
  x <- data.table::setDT(x)
  x <- x[!is.na(term), list(freq = .N), by = list(document, term)]
  
  doclabels  <- levels(x$document)
  termlabels <- levels(x$term)
  
  x$document <- as.integer(x$document)
  x$term     <- as.integer(x$term)
  
  dtm <- Matrix::sparseMatrix(i=x$document, j = x$term, x = x$freq, 
                      dims = c(length(doclabels), length(termlabels)),
                      dimnames = list(doclabels, termlabels))
  dtm
}



#' @describeIn document_term_matrix Convert an object of class \code{DocumentTermMatrix} from the tm package to a sparseMatrix
#' @export
document_term_matrix.DocumentTermMatrix <- function(x, ...){
  dtm <- Matrix::sparseMatrix(i=x$i, j = x$j, x = x$v, 
                              dims = c(x$nrow, x$ncol),
                              dimnames = x$dimnames)
  dtm
}

#' @describeIn document_term_matrix Convert an object of class \code{TermDocumentMatrix} from the tm package to a sparseMatrix with
#' the documents in the rows and the terms in the columns
#' @export
document_term_matrix.TermDocumentMatrix <- function(x, ...){
  dtm <- document_term_matrix.DocumentTermMatrix(x)
  dtm <- Matrix::t(dtm)
  dtm
}

#' @describeIn document_term_matrix Convert an object of class \code{simple_triplet_matrix} from the slam package to a sparseMatrix
#' @export
document_term_matrix.simple_triplet_matrix <- function(x, ...){
  dtm <- Matrix::sparseMatrix(i=x$i, j = x$j, x = x$v, 
                              dims = c(x$nrow, x$ncol),
                              dimnames = x$dimnames)
  dtm
}




#' @title Inverse operation of the document_term_matrix function
#' @description Inverse operation of the \code{\link{document_term_matrix}} function. 
#' Creates frequency table which contains 1 row per document/term
#' @param x an object as returned by \code{\link{document_term_matrix}}
#' @return a data.frame with columns doc_id, term and freq where freq is just the value in each
#' cell of the \code{x}
#' @export
#' @seealso \code{\link{document_term_matrix}}
#' @examples 
#' x <- data.frame(
#'  doc_id = c(1, 1, 2, 3, 4), 
#'  term = c("A", "C", "Z", "X", "G"), 
#'  freq = c(1, 5, 7, 10, 0))
#' dtm <- document_term_matrix(x)
#' dtm_reverse(dtm)
dtm_reverse <- function(x){
  m <- Matrix::summary(x)
  data.frame(doc_id = rownames(x)[m$i],
             term = colnames(x)[m$j],
             freq = m$x,
             stringsAsFactors = FALSE)
}



#' @title Remove terms occurring with low frequency from a Document-Term-Matrix and documents with no terms
#' @description Remove terms occurring with low frequency from a Document-Term-Matrix and documents with no terms
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param minfreq integer with the minimum number of times the term should occur in order to keep the term
#' @param maxterms integer indicating the maximum number of terms which should be kept in the \code{dtm}. The argument is optional. 
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix} 
#' where terms with low occurrence are removed and documents without any terms are also removed
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' 
#' 
#' ## Remove terms with low frequencies and documents with no terms
#' x <- dtm_remove_lowfreq(dtm, minfreq = 10)
#' dim(x)
#' x <- dtm_remove_lowfreq(dtm, minfreq = 10, maxterms = 25)
#' dim(x)
#' x <- dtm_remove_lowfreq(dtm, minfreq = 10, maxterms = 25, remove_emptydocs = FALSE)
#' dim(x)
dtm_remove_lowfreq <- function(dtm, minfreq=5, maxterms, remove_emptydocs = TRUE){
  dtm <- dtm[, which(Matrix::colSums(dtm) >= minfreq), drop = FALSE]
  if(!missing(maxterms)){
    terms <- head(sort(Matrix::colSums(dtm), decreasing = TRUE), n = maxterms)
    dtm <- dtm[, which(colnames(dtm) %in% names(terms)), drop = FALSE]
  }
  if(remove_emptydocs){
    dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]  
  }
  dtm
}


#' @title Remove terms with high sparsity from a Document-Term-Matrix
#' @description Remove terms with high sparsity from a Document-Term-Matrix and remove documents with no terms.\cr
#' Sparsity indicates in how many documents the term is not occurring.
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param sparsity numeric in 0-1 range indicating the sparsity percent. Defaults to 0.99 meaning drop terms which occur in less than 1 percent of the documents.
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix} 
#' where terms with high sparsity are removed and documents without any terms are also removed
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' 
#' 
#' ## Remove terms with low frequencies and documents with no terms
#' x <- dtm_remove_sparseterms(dtm, sparsity = 0.99)
#' dim(x)
#' x <- dtm_remove_sparseterms(dtm, sparsity = 0.99, remove_emptydocs = FALSE)
#' dim(x)
dtm_remove_sparseterms <- function(dtm, sparsity = 0.99, remove_emptydocs = TRUE){
  colfreq <- diff(dtm@p)
  sparseness <- 1 - (colfreq / nrow(dtm))
  keep <- which(sparseness < sparsity)
  if(length(keep) == 0){
    warning(sprintf("No terms which occur more than %s percent of the documents", 100*(1-sparsity)))
    if(remove_emptydocs){
      dtm <- dtm[0, 0, drop = FALSE]
    }else{
      dtm <- dtm[, 0, drop = FALSE]
    }
  }else{
    dtm <- dtm[, keep, drop = FALSE]
    if(remove_emptydocs){
      dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]  
    }  
  }
  dtm
}



#' @title Term Frequency - Inverse Document Frequency calculation
#' @description Term Frequency - Inverse Document Frequency calculation.
#' Averaged by each term.
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @return a vector with tfidf values, one for each term in the \code{dtm} matrix
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' 
#' ## Calculate tfidf
#' tfidf <- dtm_tfidf(dtm)
#' hist(tfidf, breaks = "scott")
#' head(sort(tfidf, decreasing = TRUE))
#' head(sort(tfidf, decreasing = FALSE))
dtm_tfidf <- function(dtm){
  ## number of times word appears / number of words in document, on average if non-missing
  ## times log2(# documents / #documentsXwords)
  terms <- colnames(dtm)
  m <- Matrix::summary(dtm)
  term_tfidf <- tapply(m$x/Matrix::rowSums(dtm)[m$i], m$j, mean) * 
    log2(nrow(dtm)/Matrix::colSums(dtm > 0))
  names(term_tfidf) <- terms
  term_tfidf
}

#' @title Remove terms from a Document-Term-Matrix and documents with no terms based on the term frequency inverse document frequency
#' @description Remove terms from a Document-Term-Matrix and documents with no terms based on the term frequency inverse document frequency.
#' Either giving in the maximum number of terms (argument \code{top}), the tfidf cutoff (argument \code{cutoff})
#' or a quantile (argument \code{prob})
#' @param dtm an object returned by \code{\link{document_term_matrix}} 
#' @param top integer with the number of terms which should be kept as defined by the highest mean tfidf
#' @param cutoff numeric cutoff value to keep only terms in \code{dtm} where the tfidf obtained by \code{dtm_tfidf} is higher than this value
#' @param prob numeric quantile indicating to keep only terms in \code{dtm} where the tfidf obtained by \code{dtm_tfidf} is higher than 
#' the \code{prob} percent quantile
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix} 
#' where terms with high tfidf are kept and documents without any remaining terms are removed
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' dtm <- dtm_remove_lowfreq(dtm, minfreq = 10)
#' dim(dtm)
#' 
#' ## Keep only terms with high tfidf
#' x <- dtm_remove_tfidf(dtm, top=50)
#' dim(x)
#' x <- dtm_remove_tfidf(dtm, top=50, remove_emptydocs = FALSE)
#' dim(x)
#' 
#' ## Keep only terms with tfidf above 1.1
#' x <- dtm_remove_tfidf(dtm, cutoff=1.1)
#' dim(x)
#' 
#' ## Keep only terms with tfidf above the 60 percent quantile
#' x <- dtm_remove_tfidf(dtm, prob=0.6)
#' dim(x)
dtm_remove_tfidf <- function(dtm, top, cutoff, prob, remove_emptydocs = TRUE){
  tfidf <- dtm_tfidf(dtm)
  if(!missing(top)){
    terms <- head(sort(tfidf, decreasing = TRUE), n = top)
    terms <- names(terms)
  }else if(!missing(cutoff)){
    terms <- tfidf[tfidf >= cutoff]
    terms <- names(terms)
  }else if(!missing(prob)){
    cutoff <- stats::quantile(tfidf, prob)
    terms <- tfidf[tfidf >= cutoff]
    terms <- names(terms)
  }else{
    stop("either provide top, cutoff or prob")
  }
  if(length(terms) == 0){
    stop("no terms found in reducing based on tfidf, consider increasing top or decreasing cutoff/prob")
  }
  dtm <- dtm[, which(colnames(dtm) %in% terms), drop = FALSE]
  if(remove_emptydocs){
    dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]  
  }
  dtm
}


#' @title Remove terms from a Document-Term-Matrix and keep only documents which have a least some terms
#' @description Remove terms from a Document-Term-Matrix and keep only documents which have a least some terms
#' @param dtm an object returned by \code{\link{document_term_matrix}} 
#' @param terms a character vector of terms which are in \code{colnames(dtm)} and which should be removed
#' @param remove_emptydocs logical indicating to remove documents containing no more terms after the term removal is executed. Defaults to \code{TRUE}.
#' @return a sparse Matrix as returned by \code{sparseMatrix} 
#' where the indicated terms are removed as well as documents with no terms whatsoever
#' @export
#' @examples 
#' data(brussels_reviews_anno)
#' x <- subset(brussels_reviews_anno, xpos == "NN")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' dim(dtm)
#' x <- dtm_remove_terms(dtm, terms = c("appartement", "casa", "centrum", "ciudad"))
#' dim(x)
#' x <- dtm_remove_terms(dtm, terms = c("appartement", "casa", "centrum", "ciudad"), 
#'                       remove_emptydocs = FALSE)
#' dim(x)
dtm_remove_terms <- function(dtm, terms, remove_emptydocs = TRUE){
  dtm <- dtm[, which(!colnames(dtm) %in% terms), drop = FALSE]
  if(remove_emptydocs){
    dtm <- dtm[Matrix::rowSums(dtm) > 0, , drop = FALSE]  
  }
  dtm
}

#' @title Pearson Correlation for Sparse Matrices
#' @description Pearson Correlation for Sparse Matrices. 
#' More memory and time-efficient than \code{cor(as.matrix(x))}. 
#' @param x A matrix, potentially a sparse matrix such as a "dgCMatrix" object 
#' which is returned by \code{\link{document_term_matrix}}
#' @return a correlation matrix 
#' @seealso \code{\link{document_term_matrix}}
#' @export
#' @examples 
#' x <- data.frame(
#'  doc_id = c(1, 1, 2, 3, 4), 
#'  term = c("A", "C", "Z", "X", "G"), 
#'  freq = c(1, 5, 7, 10, 0))
#' dtm <- document_term_matrix(x)
#' dtm_cor(dtm)
dtm_cor <- function(x) {
  n <- nrow(x)
  covmat <- (as.matrix(Matrix::crossprod(x)) - n * Matrix::tcrossprod(Matrix::colMeans(x))) / (n - 1)
  cormat <- covmat / Matrix::tcrossprod(sqrt(Matrix::diag(covmat)))
  cormat
}


#' @title Combine 2 document term matrices either by rows or by columns
#' @description These 2 methods provide \code{\link{cbind}} and \code{\link{rbind}} functionality 
#' for sparse matrix objects which are returned by \code{\link{document_term_matrix}}. \cr
#' 
#' In case of \code{dtm_cbind}, if the rows are not ordered in the same way in x and y, it will order them based on the rownames. 
#' If there are missing rows these will be filled with NA values. \cr
#' In case of \code{dtm_rbind}, if the columns are not ordered in the same way in x and y, it will order them based on the colnames. 
#' If there are missing columns these will be filled with NA values.
#' @param x a sparse matrix such as a "dgCMatrix" object which is returned by \code{\link{document_term_matrix}}
#' @param y a sparse matrix such as a "dgCMatrix" object which is returned by \code{\link{document_term_matrix}}
#' @param ... more sparse matrices
#' @return a sparse matrix where either rows are put below each other in case of \code{dtm_rbind}
#' or columns are put next to each other in case of \code{dtm_cbind}
#' @seealso \code{\link{document_term_matrix}}
#' @name dtm_bind
#' @aliases dtm_rbind dtm_cbind
#' @export
#' @examples 
#' \dontshow{
#' data.table::setDTthreads(1)
#' }
#' data(brussels_reviews_anno)
#' x <- brussels_reviews_anno
#' 
#' ## rbind
#' dtm1 <- document_term_frequencies(x = subset(x, doc_id %in% c("10049756", "10284782")),
#'                                   document = "doc_id", term = "token")
#' dtm1 <- document_term_matrix(dtm1)
#' dtm2 <- document_term_frequencies(x = subset(x, doc_id %in% c("10789408", "12285061", "35509091")),
#'                                   document = "doc_id", term = "token")
#' dtm2 <- document_term_matrix(dtm2)
#' dtm3 <- document_term_frequencies(x = subset(x, doc_id %in% c("31133394", "36224131")),
#'                                   document = "doc_id", term = "token")
#' dtm3 <- document_term_matrix(dtm3)
#' m <- dtm_rbind(dtm1, dtm2)
#' dim(m)
#' m <- dtm_rbind(dtm1, dtm2, dtm3)
#' dim(m)
#' 
#' ## cbind
#' library(data.table)
#' x <- subset(brussels_reviews_anno, language %in% c("nl", "fr"))
#' x <- as.data.table(x)
#' x <- x[, token_bigram  := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
#' x <- x[, lemma_upos    := sprintf("%s//%s", lemma, upos)]
#' dtm1 <- document_term_frequencies(x = x, document = "doc_id", term = c("token"))
#' dtm1 <- document_term_matrix(dtm1)
#' dtm2 <- document_term_frequencies(x = x, document = "doc_id", term = c("token_bigram"))
#' dtm2 <- document_term_matrix(dtm2)
#' dtm3 <- document_term_frequencies(x = x, document = "doc_id", term = c("upos"))
#' dtm3 <- document_term_matrix(dtm3)
#' dtm4 <- document_term_frequencies(x = x, document = "doc_id", term = c("lemma_upos"))
#' dtm4 <- document_term_matrix(dtm4)
#' m <- dtm_cbind(dtm1, dtm2)
#' dim(m)
#' m <- dtm_cbind(dtm1, dtm2, dtm3, dtm4)
#' dim(m)
#' m <- dtm_cbind(dtm1[-c(100, 999), ], dtm2[-1000,])
#' dim(m)
dtm_cbind <- function(x, y, ...){
  ldots <- list(...)
  if(is.null(rownames(x))) stop("x needs to have rownames")
  if(is.null(rownames(y))) stop("y needs to have rownames")
  if(length(intersect(colnames(x), colnames(y))) > 0) stop("x and y should not have overlapping column names")
  rowsleft <- rownames(x)
  rowsright <- rownames(y)
  r <- union(rowsleft, rowsright)
  addleft <- setdiff(rowsright, rowsleft)
  if(length(addleft) > 0){
    addleft <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
                                    dims = c(length(addleft), ncol(x)), dimnames = list(addleft, colnames(x)))
    x <- methods::rbind2(x, addleft)
  }
  addright <- setdiff(rowsleft, rowsright)
  if(length(addright) > 0){
    addright <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
                                     dims = c(length(addright), ncol(y)), dimnames = list(addright, colnames(y)))
    y <- methods::rbind2(y, addright)
  }
  out <- cbind2(x[r, , drop = FALSE], y[r, , drop = FALSE])
  if(length(ldots) > 0){
    largs <- ldots[-1]
    largs$x <- out
    largs$y <- ldots[[1]]
    out <- do.call(dtm_cbind, largs)
  }
  out
}

#' @export
#' @rdname dtm_bind
dtm_rbind <- function(x, y, ...){
  ldots <- list(...)
  if(is.null(colnames(x))) stop("x needs to have colnames")
  if(is.null(colnames(y))) stop("y needs to have colnames")
  if(length(intersect(rownames(x), rownames(y))) > 0) stop("x and y should not have overlapping row names")
  colsleft <- colnames(x)
  colsright <- colnames(y)
  r <- union(colsleft, colsright)
  addleft <- setdiff(colsright, colsleft)
  if(length(addleft) > 0){
    addleft <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
                                    dims = c(nrow(x), length(addleft)), dimnames = list(rownames(x), addleft))
    x <- methods::cbind2(x, addleft)
  }
  addright <- setdiff(colsleft, colsright)
  if(length(addright) > 0){
    addright <- Matrix::sparseMatrix(i = integer(), j = integer(), x = NA,
                                     dims = c(nrow(y), length(addright)), dimnames = list(rownames(y), addright))
    y <- methods::cbind2(y, addright)
  }
  out <- rbind2(x[, r, drop = FALSE], y[, r, drop = FALSE])
  if(length(ldots) > 0){
    largs <- ldots[-1]
    largs$x <- out
    largs$y <- ldots[[1]]
    out <- do.call(dtm_rbind, largs)
  }
  out
}


#' @title Column sums and Row sums for document term matrices
#' @description Column sums and Row sums for document term matrices
#' @param dtm an object returned by \code{\link{document_term_matrix}}
#' @param groups optionally, a list with column/row names or column/row indexes of the \code{dtm} which should be combined by 
#' taking the sum over the rows or columns of these. See the examples
#' @return 
#' Returns either a vector in case argument \code{groups} is not provided or a sparse matrix of class \code{dgCMatrix}
#' in case argument \code{groups} is provided
#' \itemize{
#' \item{in case \code{groups} is not provided: a vector of row/column sums with corresponding names}
#' \item{in case \code{groups} is provided: a sparse matrix containing summed information over the groups of rows/columns}
#' }
#' @export
#' @aliases dtm_colsums dtm_rowsums
#' @examples 
#' x <- data.frame(
#'  doc_id = c(1, 1, 2, 3, 4), 
#'  term = c("A", "C", "Z", "X", "G"), 
#'  freq = c(1, 5, 7, 10, 0))
#' dtm <- document_term_matrix(x)
#' x <- dtm_colsums(dtm)
#' x
#' x <- dtm_rowsums(dtm)
#' head(x)
#' 
#' ## 
#' ## Grouped column summation
#' ## 
#' x <- list(doc1 = c("aa", "bb", "aa", "b"), doc2 = c("bb", "bb", "BB"))
#' dtm <- document_term_matrix(x)
#' dtm
#' dtm_colsums(dtm, groups = list(combinedB = c("b", "bb"), combinedA = c("aa", "A")))
#' dtm_colsums(dtm, groups = list(combinedA = c("aa", "A")))
#' dtm_colsums(dtm, groups = list(
#'   combinedB = grep(pattern = "b", colnames(dtm), ignore.case = TRUE, value = TRUE), 
#'   combinedA = c("aa", "A", "ZZZ"),
#'   test      = character()))
#' dtm_colsums(dtm, groups = list())
#' 
#' ## 
#' ## Grouped row summation
#' ## 
#' x <- list(doc1 = c("aa", "bb", "aa", "b"), 
#'           doc2 = c("bb", "bb", "BB"),
#'           doc3 = c("bb", "bb", "BB"),
#'           doc4 = c("bb", "bb", "BB", "b"))
#' dtm <- document_term_matrix(x)
#' dtm
#' dtm_rowsums(dtm, groups = list(doc1 = "doc1", combi = c("doc2", "doc3", "doc4")))
#' dtm_rowsums(dtm, groups = list(unknown = "docUnknown", combi = c("doc2", "doc3", "doc4")))
#' dtm_rowsums(dtm, groups = list())
dtm_colsums <- function(dtm, groups){
  if(missing(groups)){
    Matrix::colSums(dtm)
  }else{
    dtm_colsums_group(dtm, groups)
  }
}

#' @export
#' @rdname dtm_colsums
dtm_rowsums <- function(dtm, groups){
  if(missing(groups)){
    Matrix::rowSums(dtm)
  }else{
    dtm_rowsums_group(dtm, groups)
  }
}


dtm_colsums_group <- function(x, groups){
  stopifnot(is.list(groups))
  if(length(groups) == 0){
     extra <- matrix(nrow = nrow(x), ncol = 0, byrow = FALSE, dimnames = list(rownames(x)))
     extra <- document_term_matrix.matrix(extra, "dgCMatrix")
     return(extra)
  }
  #stopifnot(length(groups) > 0)
  #stopifnot(all(sapply(groups, is.character)))
  if(is.null(names(groups))){
    stop("groups should have names")
  }
  x_fields <- colnames(x)
  groups <- lapply(groups, FUN=function(x){
    if(is.character(x)){
      x
    }else if(is.integer(x) | is.numeric(x)){
      x_fields[x]
    }else{
      stop("groups should contain a list of column names")
    }
  })
  extra <- mapply(fields = groups, column_j = seq_along(groups), FUN=function(fields, column_j){
    fields <- which(x_fields %in% fields)
    if(length(fields) == 0){
      list(i = integer(), j = integer(), x = numeric())
    }else{
      values <- dtm_rowsums(x[, fields, drop = FALSE])
      idx <- which(values != 0)
      list(i = idx, j = rep(column_j, length(idx)), x = values[idx])
    } 
  }, SIMPLIFY = FALSE)
  extra <- data.table::rbindlist(extra)
  extra <- Matrix::sparseMatrix(dims = c(nrow(x), length(groups)), 
                                dimnames = list(rownames(x), names(groups)), 
                                i = extra$i, j = extra$j, x = extra$x)
  extra
}


dtm_rowsums_group <- function(x, groups){
  stopifnot(is.list(groups))
  if(length(groups) == 0){
     extra <- matrix(nrow = 0, ncol = ncol(x), byrow = FALSE, dimnames = list(character(), colnames(x)))
     extra <- document_term_matrix.matrix(extra, "dgCMatrix")
     return(extra)
  }
  #stopifnot(length(groups) > 0)
  #stopifnot(all(sapply(groups, is.character)))
  if(is.null(names(groups))){
    stop("groups should have names")
  }
  x_rows <- rownames(x)
  groups <- lapply(groups, FUN=function(x){
    if(is.character(x)){
      x
    }else if(is.integer(x) | is.numeric(x)){
      x_rows[x]
    }else{
      stop("groups should contain a list of row names")
    }
  })
  extra <- mapply(rows = groups, row_i = seq_along(groups), FUN=function(rows, row_i){
    rows <- which(x_rows %in% rows)
    if(length(rows) == 0){
      list(i = integer(), j = integer(), x = numeric())
    }else{
      values <- dtm_colsums(x[rows, , drop = FALSE])
      idx <- which(values != 0)
      list(i = rep(row_i, length(idx)), j = idx, x = values[idx])
    } 
  }, SIMPLIFY = FALSE)
  extra <- data.table::rbindlist(extra)
  extra <- Matrix::sparseMatrix(dims = c(length(groups), ncol(x)), 
                                dimnames = list(names(groups), colnames(x)), 
                                i = extra$i, j = extra$j, x = extra$x)
  extra
}




#' @title Compare term usage across 2 document groups using the Chi-square Test for Count Data
#' @description Perform a \code{\link{chisq.test}} to compare if groups of documents have more prevalence of specific terms.\cr
#' The function looks to each term in the document term matrix and applies a \code{\link{chisq.test}} comparing the frequency 
#' of occurrence of each term compared to the other terms in the document group.
#' @param dtm a document term matrix: an object returned by \code{\link{document_term_matrix}}
#' @param groups a logical vector with 2 groups (TRUE / FALSE) where the size of the \code{groups} vector 
#' is the same as the number of rows of \code{dtm} and where element i corresponds row i of \code{dtm}
#' @param correct passed on to \code{\link{chisq.test}}
#' @param ... further arguments passed on to \code{\link{chisq.test}}
#' @export
#' @return a data.frame with columns term, chisq, p.value, freq, freq_true, freq_false indicating for each term in the \code{dtm},
#' how frequently it occurs in each group, the Chi-Square value and it's corresponding p-value.
#' @examples 
#' data(brussels_reviews_anno)
#' ##
#' ## Which nouns occur in text containing the term 'centre'
#' ##
#' x <- subset(brussels_reviews_anno, xpos == "NN" & language == "fr")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' relevant <- dtm_chisq(dtm, groups = dtm[, "centre"] > 0)
#' head(relevant, 10)
#' 
#' ##
#' ## Which adjectives occur in text containing the term 'hote'
#' ##
#' x <- subset(brussels_reviews_anno, xpos == "JJ" & language == "fr")
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' 
#' group <- subset(brussels_reviews_anno, lemma %in% "hote")
#' group <- rownames(dtm) %in% group$doc_id
#' relevant <- dtm_chisq(dtm, groups = group)
#' head(relevant, 10)
#' 
#' 
#' \dontrun{
#' # do not show scientific notation of the p-values
#' options(scipen = 100)
#' head(relevant, 10)
#' }
dtm_chisq <- function(dtm, groups, correct = TRUE, ...){
  stopifnot(is.logical(groups))
  stopifnot(length(unique(groups)) == 2)
  stopifnot(length(groups) == nrow(dtm))
  recode <- function(x, from, to){
    to[match(x, from)]
  }
  
  DTM <- dtm_reverse(dtm)
  DTM <- data.frame(doc_id = recode(DTM$doc_id, from = rownames(dtm), to = groups), 
                    term = DTM$term, 
                    freq = DTM$freq, stringsAsFactors = FALSE)
  DTM <- document_term_frequencies(DTM)
  DTM <- document_term_matrix(DTM)
  
  freq_true  <- DTM["TRUE" , , drop = TRUE]
  freq_false <- DTM["FALSE", , drop = TRUE]
  
  contingencies <- data.frame(term = colnames(DTM), 
                              term_freq_true  = freq_true, 
                              term_freq_false = freq_false,
                              otherterms_freq_true  = sum(freq_true) - freq_true, 
                              otherterms_freq_flase = sum(freq_false) - freq_false, 
                              stringsAsFactors = FALSE)
  result <- mapply(term = contingencies$term, 
                   a = contingencies$term_freq_true, b = contingencies$term_freq_false, 
                   c = contingencies$otherterms_freq_true, d = contingencies$otherterms_freq_flase, 
                   FUN = function(term, a, b, c, d){
                     tab <- matrix(c(a, b, c, d), nrow = 2, ncol = 2, byrow = TRUE)
                     suppressWarnings(result <- chisq.test(tab, correct = correct, ...))
                     list(term = term,
                          chisq = result$statistic, 
                          p.value = result$p.value,
                          freq = a + b,
                          freq_true = a, 
                          freq_false = b)
                   }, SIMPLIFY = FALSE)
  result <- data.table::rbindlist(result)
  result <- result[order(result$chisq, decreasing = TRUE), ]
  result <- data.table::setDF(result)
  result
}



#' @title Make sure a document term matrix has exactly the specified rows and columns
#' @description Makes sure the document term matrix has exactly the rows and columns which you specify. If missing rows or columns
#' are occurring, the function fills these up either with empty cells or with the value that you provide. See the examples.
#' @param dtm a document term matrix: an object returned by \code{\link{document_term_matrix}}
#' @param rows a character vector of row names which \code{dtm} should have
#' @param columns a character vector of column names which \code{dtm} should have
#' @param fill a value to use to fill up missing rows / columns. Defaults to using an empty cell.
#' @return the sparse matrix \code{dtm} with exactly the specified rows and columns
#' @export
#' @seealso \code{\link{document_term_matrix}}
#' @examples 
#' x <- data.frame(doc_id = c("doc_1", "doc_1", "doc_1", "doc_2"), 
#'                 text = c("a", "a", "b", "c"), 
#'                 stringsAsFactors = FALSE)
#' dtm <- document_term_frequencies(x)
#' dtm <- document_term_matrix(dtm)
#' dtm
#' dtm_conform(dtm, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(dtm, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"), 
#'             fill = 1)
#' dtm_conform(dtm, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(dtm, columns = c("a", "b", "Z"))
#' dtm_conform(dtm, rows = c("doc_1"))
#' dtm_conform(dtm, rows = character())
#' dtm_conform(dtm, columns = character())
#' dtm_conform(dtm, rows = character(), columns = character())
#' 
#' ##
#' ## Some examples on border line cases
#' ##
#' special1 <- dtm[, character()]
#' special2 <- dtm[character(), character()]
#' special3 <- dtm[character(), ]
#' 
#' dtm_conform(special1, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special1, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"), 
#'             fill = 1)
#' dtm_conform(special1, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special1, columns = c("a", "b", "Z"))
#' dtm_conform(special1, rows = c("doc_1"))
#' dtm_conform(special1, rows = character())
#' dtm_conform(special1, columns = character())
#' dtm_conform(special1, rows = character(), columns = character())
#' 
#' dtm_conform(special2, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special2, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"), 
#'             fill = 1)
#' dtm_conform(special2, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special2, columns = c("a", "b", "Z"))
#' dtm_conform(special2, rows = c("doc_1"))
#' dtm_conform(special2, rows = character())
#' dtm_conform(special2, columns = character())
#' dtm_conform(special2, rows = character(), columns = character())
#' 
#' dtm_conform(special3, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special3, 
#'             rows = c("doc_1", "doc_2", "doc_3"), columns = c("a", "b", "c", "Z", "Y"), 
#'             fill = 1)
#' dtm_conform(special3, rows = c("doc_1", "doc_3"), columns = c("a", "b", "c", "Z", "Y"))
#' dtm_conform(special3, columns = c("a", "b", "Z"))
#' dtm_conform(special3, rows = c("doc_1"))
#' dtm_conform(special3, rows = character())
#' dtm_conform(special3, columns = character())
#' dtm_conform(special3, rows = character(), columns = character())
dtm_conform <- function(dtm, rows, columns, fill){
  if(!missing(columns)){
    missing_terms <- setdiff(columns, colnames(dtm))
    if(length(missing_terms)){
      if(!missing(fill)){
        extra <- expand.grid(i = seq_len(nrow(dtm)), j = length(missing_terms))
        extra <- Matrix::sparseMatrix(dims = c(nrow(dtm), length(missing_terms)), dimnames = list(rownames(dtm), missing_terms), i = extra$i, j = extra$j, x = fill)
      }else{
        extra <- Matrix::sparseMatrix(dims = c(nrow(dtm), length(missing_terms)), dimnames = list(rownames(dtm), missing_terms), i = {}, j = {})  
      }
      if(nrow(dtm) == 0){
        dtm <- methods::cbind2(dtm, extra)  
      }else{
        dtm <- dtm_cbind(dtm, extra)
      }
    }  
  }
  if(!missing(rows)){
    missing_docs <- setdiff(rows, rownames(dtm))
    if(length(missing_docs) > 0){
      if(!missing(fill)){
        extra <- expand.grid(i = seq_len(length(missing_docs)), j = ncol(dtm))
        extra <- Matrix::sparseMatrix(dims = c(length(missing_docs), ncol(dtm)), dimnames = list(missing_docs, colnames(dtm)), i = extra$i, j = extra$j, x = fill)
      }else{
        extra <- Matrix::sparseMatrix(dims = c(length(missing_docs), ncol(dtm)), dimnames = list(missing_docs, colnames(dtm)), i = {}, j = {})
      }
      if(ncol(dtm) == 0){
        dtm <- methods::rbind2(dtm, extra)  
      }else{
        dtm <- udpipe::dtm_rbind(dtm, extra)  
      }
    }  
  }
  if(!missing(rows) & !missing(columns)){
    dtm <- dtm[rows, columns, drop = FALSE]  
  }else if(!missing(rows)){
    dtm <- dtm[rows, , drop = FALSE]  
  }else if(!missing(columns)){
    dtm <- dtm[, columns, drop = FALSE]  
  }
  dtm
}


#' @title Reorder a Document-Term-Matrix alongside a vector or data.frame 
#' @description This utility function is useful to align a Document-Term-Matrix with 
#' information in a data.frame or a vector to predict, such that both the predictive information as well as the target 
#' is available in the same order. \cr
#' Matching is done based on the identifiers in the rownames of \code{x} and either the names of the \code{y} vector 
#' or the first column of \code{y} in case it is a data.frame.
#' @param x a Document-Term-Matrix of class dgCMatrix (which can be an object returned by \code{\link{document_term_matrix}})
#' @param y either a vector or data.frame containing something to align with \code{x} (e.g. for predictive purposes).
#' \itemize{
#' \item{In case \code{y} is a vector, it should have names which are available in the rownames of \code{x}.}
#' \item{In case \code{y} is a data.frame, it's first column should contain identifiers which are available in the rownames of \code{x}.}
#' }
#' @param FUN a function to be applied on \code{x} before aligning it to \code{y}. See the examples
#' @param ... further arguments passed on to FUN
#' @return a list with elements \code{x} and \code{y} containing the document term matrix \code{x} in the same order as \code{y}.
#' \itemize{
#' \item{If in \code{y} a vector was passed, the returned \code{y} element will be a vector}
#' \item{If in \code{y} a data.frame was passed with more than 2 columns, the returned \code{y} element will be a data.frame}
#' \item{If in \code{y} a data.frame was passed with exactly 2 columns, the returned \code{y} element will be a vector}
#' }
#' Only returns data of \code{x} with overlapping identifiers in \code{y}.
#' @export
#' @seealso \code{\link{document_term_matrix}}
#' @examples 
#' x <- matrix(1:9, nrow = 3, dimnames = list(c("a", "b", "c")))
#' x
#' dtm_align(x = x, 
#'           y = c(b = 1, a = 2, c = 6, d = 6))
#' dtm_align(x = x, 
#'           y = c(b = 1, a = 2, c = 6, d = 6, d = 7, a = -1))
#'           
#' data(brussels_reviews)
#' data(brussels_listings)
#' x <- brussels_reviews
#' x <- strsplit.data.frame(x, term = "feedback", group = "listing_id")
#' x <- document_term_frequencies(x)
#' x <- document_term_matrix(x)
#' y <- brussels_listings$price
#' names(y) <- brussels_listings$listing_id
#' 
#' ## align a matrix of predictors with a vector to predict
#' trainset <- dtm_align(x = x, y = y)
#' trainset <- dtm_align(x = x, y = y, FUN = function(dtm){
#'   dtm <- dtm_remove_lowfreq(dtm, minfreq = 5)
#'   dtm <- dtm_sample(dtm)
#'   dtm
#' })
#' head(names(y))
#' head(rownames(x))
#' head(names(trainset$y))
#' head(rownames(trainset$x))
#' 
#' ## align a matrix of predictors with a data.frame
#' trainset <- dtm_align(x = x, y = brussels_listings[, c("listing_id", "price")])
#' trainset <- dtm_align(x = x, 
#'                 y = brussels_listings[, c("listing_id", "price", "room_type")])
#' head(trainset$y$listing_id)
#' head(rownames(trainset$x))
#' 
#' ## example with duplicate data in case of data balancing
#' dtm_align(x = matrix(1:30, nrow = 3, dimnames = list(c("a", "b", "c"))), 
#'           y = c(a = 1, a = 2, b = 3, d = 6, b = 6))
#' target   <- subset(brussels_listings, listing_id %in% brussels_reviews$listing_id)
#' target   <- rbind(target[1:3, ], target[c(2, 3), ], target[c(1, 4), ])
#' trainset <- dtm_align(x = x, y = target[, c("listing_id", "price")])
#' trainset <- dtm_align(x = x, y = setNames(target$price, target$listing_id))
#' names(trainset$y)
#' rownames(trainset$x)
dtm_align <- function(x, y, FUN, ...){
  if(!inherits(x, c("dgCMatrix", "matrix"))){
    warning(sprintf("expecting x to be of class dgCMatrix, while you passed a %s", paste(class(x), collapse = " ")))
  }
  if(!missing(FUN)){
    x <- FUN(x, ...)
  }
  if(is.data.frame(y)){
    if(ncol(y) < 2){
      stop("y is a data.frame, it should have at least 2 columns, where the first is a document id")
    }
    nm <- y[[1]]
    #y <- y[[2]]
    #names(y) <- nm
  }else if(is.vector(y) | is.factor(y)){
    nm <- names(y)
    if(is.null(nm)){
      stop("y is required to be a vector which has names otherwise we can not align it with the rownames of x")
    }
  }else{
    stop("dtm_match is only implemented for y of type data.frame or with vectors")
  }
  X   <- x[which(rownames(x) %in% nm), , drop = FALSE]
  #idx <- match(rownames(X), nm)
  idx <- which(nm %in% rownames(X))
  if(NCOL(y) == 1){
    Y <- y[idx]
  }else if(NCOL(y) == 2){
    Y <- y[[2]]
    names(Y) <- nm
    Y <- Y[idx]
  }else{
    Y <- y[idx, , drop = FALSE]
  }
  X   <- X[match(nm[idx], rownames(X)), , drop = FALSE]
  structure(list(x = X, y = Y), class = "dtm_aligned")
}

#' @title Random samples and permutations from a Document-Term-Matrix
#' @description Sample the specified number of rows from the Document-Term-Matrix using either with or without replacement. 
#' @param dtm a document term matrix of class dgCMatrix (which can be an object returned by \code{\link{document_term_matrix}})
#' @param size a positive number, the number of rows to sample
#' @param replace should sampling be with replacement
#' @param prob a vector of probability weights, one for each row of \code{x}
#' @export
#' @return \code{dtm} with as many rows as specified in \code{size}
#' @examples 
#' x <- list(doc1 = c("aa", "bb", "cc", "aa", "b"), 
#'           doc2 = c("bb", "bb", "dd", ""), 
#'           doc3 = character(),
#'           doc4 = c("cc", NA), 
#'           doc5 = character())
#' dtm <- document_term_matrix(x)
#' dtm_sample(dtm, size = 2)
#' dtm_sample(dtm, size = 3)
#' dtm_sample(dtm, size = 2)
#' dtm_sample(dtm, size = 8, replace = TRUE)
#' dtm_sample(dtm, size = 8, replace = TRUE, prob = c(1, 1, 0.01, 0.5, 0.01))
dtm_sample <- function(dtm, size = nrow(dtm), replace = FALSE, prob = NULL){
  idx <- sample.int(n = nrow(dtm), size = size, replace = replace, prob = prob)
  dtm[idx, ]
}



#' @title Semantic Similarity to a Singular Value Decomposition
#' @description Calculate the similarity of a document term matrix to a set of terms based on 
#' a Singular Value Decomposition (SVD) embedding matrix.\cr
#' This can be used to easily construct a sentiment score based on the latent scale defined by a set of positive or negative terms.
#' @param dtm a sparse matrix such as a "dgCMatrix" object which is returned by \code{\link{document_term_matrix}} containing frequencies of terms for each document
#' @param embedding a matrix containing the \code{v} element from an singular value decomposition with the right singular vectors. 
#' The rownames of that matrix should contain terms which are available in the \code{colnames(dtm)}. See the examples.
#' @param weights a numeric vector with weights giving your definition of which terms are positive or negative, 
#' The names of this vector should be terms available in the rownames of the embedding matrix. See the examples.
#' @param terminology a character vector of terms to limit the calculation of the similarity for the \code{dtm} to the linear combination of the weights. 
#' Defaults to all terms from the \code{embedding} matrix.
#' @param type either 'cosine' or 'dot' indicating to respectively calculate cosine similarities or inner product similarities between the \code{dtm} and the SVD embedding space. Defaults to 'cosine'.
#' @export
#' @return an object of class 'svd_similarity' which is a list with elements
#' \itemize{
#' \item weights: The weights used. These are scaled to sum up to 1 as well on the positive as the negative side
#' \item type: The type of similarity calculated (either 'cosine' or 'dot')
#' \item terminology: A data.frame with columns term, freq and similarity where similarity indicates 
#' the similarity between the term and the SVD embedding space of the weights and freq is how frequently the term occurs in the \code{dtm}. 
#' This dataset is sorted in descending order by similarity.
#' \item similarity: A data.frame with columns doc_id and similarity indicating the similarity between
#' the \code{dtm} and the SVD embedding space of the weights. The doc_id is the identifier taken from the rownames of \code{dtm}.
#' \item scale: A list with elements terminology and weights 
#' indicating respectively the similarity in the SVD embedding space
#' between the \code{terminology} and each of the weights and between the weight terms itself
#' }
#' @seealso \url{https://en.wikipedia.org/wiki/Latent_semantic_analysis}
#' @examples
#' data("brussels_reviews_anno", package = "udpipe")
#' x <- subset(brussels_reviews_anno, language %in% "nl" & (upos %in% "ADJ" | lemma %in% "niet"))
#' dtm <- document_term_frequencies(x, document = "doc_id", term = "lemma")
#' dtm <- document_term_matrix(dtm)
#' dtm <- dtm_remove_lowfreq(dtm, minfreq = 3)
#' 
#' ## Function performing Singular Value Decomposition on sparse/dense data
#' dtm_svd <- function(dtm, dim = 5, type = c("RSpectra", "svd"), ...){
#'   type <- match.arg(type)
#'   if(type == "svd"){
#'     SVD <- svd(dtm, nu = 0, nv = dim, ...)
#'   }else if(type == "RSpectra"){
#'     #Uncomment this if you want to use the faster sparse SVD by RSpectra
#'     #SVD <- RSpectra::svds(dtm, nu = 0, k = dim, ...)
#'   }
#'   rownames(SVD$v) <- colnames(dtm)
#'   SVD$v
#' }
#' #embedding <- dtm_svd(dtm, dim = 5)
#' embedding <- dtm_svd(dtm, dim = 5, type = "svd")
#' 
#' ## Define positive / negative terms and calculate the similarity to these
#' weights <- setNames(c(1, 1, 1, 1, -1, -1, -1, -1),
#'                     c("fantastisch", "schoon", "vriendelijk", "net",
#'                       "lawaaiig", "lastig", "niet", "slecht"))
#' scores <- dtm_svd_similarity(dtm, embedding = embedding, weights = weights)
#' scores
#' str(scores$similarity)
#' hist(scores$similarity$similarity)
#' 
#' plot(scores$terminology$similarity_weight, log(scores$terminology$freq), 
#'      type = "n")
#' text(scores$terminology$similarity_weight, log(scores$terminology$freq), 
#'      labels = scores$terminology$term)
#'      
#' \dontrun{
#' ## More elaborate example using word2vec
#' ## building word2vec model on all Dutch texts, 
#' ## finding similarity of dtm to adjectives only
#' set.seed(123)
#' library(word2vec)
#' text      <- subset(brussels_reviews_anno, language == "nl")
#' text      <- paste.data.frame(text, term = "lemma", group = "doc_id")
#' text      <- text$lemma
#' model     <- word2vec(text, dim = 10, iter = 20, type = "cbow", min_count = 1)
#' predict(model, newdata = names(weights), type = "nearest", top_n = 3)
#' embedding <- as.matrix(model)
#' }
#' data(brussels_reviews_w2v_embeddings_lemma_nl)
#' embedding <- brussels_reviews_w2v_embeddings_lemma_nl
#' adjective <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% "ADJ")
#' adjective <- txt_freq(adjective$lemma)
#' adjective <- subset(adjective, freq >= 5 & nchar(key) > 1)
#' adjective <- adjective$key
#' 
#' scores    <- dtm_svd_similarity(dtm, embedding, weights = weights, type = "dot", 
#'                                 terminology = adjective)
#' scores
#' plot(scores$terminology$similarity_weight, log(scores$terminology$freq), 
#'      type = "n")
#' text(scores$terminology$similarity_weight, log(scores$terminology$freq), 
#'      labels = scores$terminology$term, cex = 0.8)
dtm_svd_similarity <- function(dtm, embedding, weights, terminology = rownames(embedding), type = c("cosine", "dot")){
  doc_id <- term <- prop <- in_terminology <- NULL
  embedding_similarity <- function(x, y, type = c("cosine", "dot")) {
    if(!is.matrix(x)){
      x <- matrix(x, nrow = 1)
    }
    if(!is.matrix(y)){
      y <- matrix(y, nrow = 1)
    }
    type <- match.arg(type)
    
    if(type == "dot"){
      similarities <- tcrossprod(x, y)
    }else if(type == "cosine"){
      normx <- sqrt(rowSums(x^2))
      normy <- sqrt(rowSums(y^2))
      similarities <- tcrossprod(x, y) / (normx %o% normy)
    }
    similarities
  }
  setNames <- function(x, labels){
    names(x) <- labels
    x
  }
  ##
  ## DATA CHECKS
  ##
  type <- match.arg(type)
  stopifnot(is.matrix(embedding))
  stopifnot(is.vector(weights))
  
  object <- list(dim = ncol(embedding),
                 terminology = rownames(embedding),
                 embedding = as.matrix(embedding))
  
  if(missing(terminology)){
    terminology <- object$terminology
  }else{
    missing_embeddings <- setdiff(terminology, object$terminology)
    if(length(missing_embeddings) > 0){
      warning(sprintf("Removing '%s' from terminology as these are not part of rownames(embedding)", paste(missing_embeddings, collapse = ", ")))
    }
    terminology <- intersect(terminology, object$terminology)
  }
  not_known_weights <- setdiff(names(weights), object$terminology)
  if(length(not_known_weights) > 0){
    warning(sprintf("Removing '%s' from weights as these are not part of rownames(embedding)", paste(not_known_weights, collapse = ", ")))
    weights <- weights[!names(weights) %in% not_known_weights]
  }
  stopifnot(length(weights) > 0)
  if(length(unique(sign(weights))) != 2){
    stop("weights should contain positive as well as negative values")
  }
  
  ## rescale weights to make sure positive values sum to 1 and negative also sum to -1
  weights_scaled <- data.frame(term = names(weights), 
                               weight = weights, 
                               sign = sign(weights),
                               stringsAsFactors = FALSE)
  weights_scaled <- lapply(X = split(weights_scaled, weights_scaled$sign),
                           FUN = function(x){
                             x$weight_scaled = x$weight / abs(sum(x$weight))
                             x
                           })
  weights_scaled <- do.call(rbind, weights_scaled)
  weights_scaled <- setNames(weights_scaled$weight_scaled, weights_scaled$term)
  #weights_scaled <- setNames(weights_scaled$weight / nrow(weights_scaled), weights_scaled$term)
  
  
  ####################################################################################
  ## Calculate embedding similarity of terminology to weights, weights-to-weights
  ## and the embedding space of the linear combination of the weights
  ##
  weightsterms <- names(weights_scaled)
  
  similarity_terminology_to_weights <- embedding_similarity(
    object$embedding[terminology, , drop = FALSE],
    object$embedding[weightsterms, , drop = FALSE],
    type = type)

  similarity_weights_to_weights <- embedding_similarity(
    object$embedding[weightsterms, , drop = FALSE],
    object$embedding[weightsterms, , drop = FALSE],
    type = type)
  
  ## weighted term-to-weights similarity reflecting a scale on which -1 to +1 range lies
  weightspace <- similarity_terminology_to_weights %*% weights_scaled
  weightspace <- weightspace[, 1]
  
  ##
  ## Predict (for terms part of the terminology, get the weights embedding space)
  ##
  freq <- dtm_colsums(dtm)
  dtf <- dtm_reverse(dtm)
  dtf <- data.table::setDT(dtf)
  dtf <- dtf[, in_terminology := term %in% terminology, ]
  #dtf <- dtf[, prop := as.numeric(freq / sum(freq)), by = list(doc_id)]
  dtf <- dtf[, prop := as.numeric(ifelse(any(in_terminology), freq[in_terminology] / sum(freq[in_terminology]), 0)), by = list(doc_id)]
  dtm <- document_term_matrix(dtf, weight = "prop", vocabulary = terminology)
  dtm <- dtm[, terminology, drop = FALSE]
  
  scores <- dtm %*% weightspace
  scores <- scores[, 1]
  scores <- data.frame(doc_id = names(scores), 
                       similarity = as.numeric(scores), 
                       stringsAsFactors = FALSE)
  
  terminology_similarity <- sort(weightspace, decreasing = TRUE)
  terminology_similarity <- data.frame(
    term = names(terminology_similarity), 
    freq = txt_recode(names(terminology_similarity), from = names(freq), to = as.integer(freq), na.rm = TRUE),
    similarity_weight = as.numeric(terminology_similarity),
    stringsAsFactors = FALSE)
  terminology_similarity$freq <- ifelse(is.na(terminology_similarity$freq), 0, terminology_similarity$freq)
  
  result <- list(weights = weights_scaled,
                 type = type,
                 terminology = terminology_similarity,
                 similarity = scores,
                 scale = list(
                   terminology = similarity_terminology_to_weights,
                   weights = similarity_weights_to_weights))
  class(result) <- "svd_similarity"
  result
}

#' @export
print.svd_similarity <- function(x, n = 7, digits = 2, ...){
  cat(sprintf("Latent Semantic Scaling using %s similarity on SVD matrix", x$type), sep = "\n")
  cat(sprintf("Weights: %s", paste(sprintf("%s %s", names(x$weights), round(x$weights, 4)), collapse = ", ")), sep = "\n")
  cat(sprintf("Top %s most similar terms on the high end of the scale", n), sep = "\n")
  elements <- head(x$terminology, n = n, sep = "\n")
  mapply(term = elements$term, freq = elements$freq, similarity = elements$similarity, FUN=function(term, freq, similarity){
    cat(sprintf(" - %s %s (frequency: %s)", term, round(similarity, digits = digits), freq), sep = "\n")
  })
  cat(sprintf("Top %s most similar terms on the low end of the scale", n), sep = "\n")
  elements <- tail(x$terminology, n = n, sep = "\n")
  elements <- elements[order(elements$similarity, decreasing = FALSE), ]
  mapply(term = elements$term, freq = elements$freq, similarity = elements$similarity, FUN=function(term, freq, similarity){
    cat(sprintf(" - %s %s (frequency: %s)", term, round(similarity, digits = digits), freq), sep = "\n")
  })
  invisible()
}

Try the udpipe package in your browser

Any scripts or data that you put into this service are public.

udpipe documentation built on Jan. 6, 2023, 5:06 p.m.