document_term_matrix | R Documentation |
Create a document/term matrix from either
a data.frame with 1 row per document/term as returned by document_term_frequencies
a list of tokens, e.g. from package sentencepiece, tokenizers.bpe or just by using strsplit
an object of class DocumentTermMatrix or TermDocumentMatrix from the tm package
an object of class simple_triplet_matrix from the slam package
a regular dense matrix
document_term_matrix(x, vocabulary, weight = "freq", ...) ## S3 method for class 'data.frame' document_term_matrix(x, vocabulary, weight = "freq", ...) ## S3 method for class 'matrix' document_term_matrix(x, ...) ## S3 method for class 'integer' document_term_matrix(x, ...) ## S3 method for class 'numeric' document_term_matrix(x, ...) ## Default S3 method: document_term_matrix(x, vocabulary, ...) ## S3 method for class 'DocumentTermMatrix' document_term_matrix(x, ...) ## S3 method for class 'TermDocumentMatrix' document_term_matrix(x, ...) ## S3 method for class 'simple_triplet_matrix' document_term_matrix(x, ...)
x |
a data.frame with columns doc_id, term and freq indicating how many times a term occurred in that specific document. This is what document_term_frequencies returns. |
vocabulary |
a character vector of terms which should be present in the document term matrix even if they did not occur in x |
weight |
a column of x whose values are put in the cells of the matrix. Defaults to "freq" |
... |
further arguments currently not used |
a sparse object of class dgCMatrix with the documents in the rows and the terms in the columns, containing the frequencies
provided in x
extended with terms which were not in x
but were provided in vocabulary
.
The rownames of this resulting object contain the doc_id from x
data.frame
: Construct a document term matrix from a data.frame with columns doc_id, term, freq
matrix
: Construct a sparse document term matrix from a matrix
integer
: Construct a sparse document term matrix from a named integer vector
numeric
: Construct a sparse document term matrix from a named numeric vector
default
: Construct a document term matrix from a list of tokens
DocumentTermMatrix
: Convert an object of class DocumentTermMatrix
from the tm package to a sparseMatrix
TermDocumentMatrix
: Convert an object of class TermDocumentMatrix
from the tm package to a sparseMatrix with
the documents in the rows and the terms in the columns
simple_triplet_matrix
: Convert an object of class simple_triplet_matrix
from the slam package to a sparseMatrix
sparseMatrix
, document_term_frequencies
x <- data.frame(doc_id = c(1, 1, 2, 3, 4), term = c("A", "C", "Z", "X", "G"), freq = c(1, 5, 7, 10, 0)) document_term_matrix(x) document_term_matrix(x, vocabulary = LETTERS) ## Example on larger dataset data(brussels_reviews_anno) x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")]) dtm <- document_term_matrix(x) dim(dtm) x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")]) x <- document_term_frequencies_statistics(x) dtm <- document_term_matrix(x) dtm <- document_term_matrix(x, weight = "freq") dtm <- document_term_matrix(x, weight = "tf_idf") dtm <- document_term_matrix(x, weight = "bm25") x <- split(brussels_reviews_anno$lemma, brussels_reviews_anno$doc_id) dtm <- document_term_matrix(x) ## example showing the vocubulary argument ## allowing you to making sure terms which are not in the data are provided in the resulting dtm allterms <- unique(x$term) dtm <- document_term_matrix(head(x, 1000), vocabulary = allterms) ## example for a list of tokens x <- list(doc1 = c("aa", "bb", "cc", "aa", "b"), doc2 = c("bb", "bb", "dd", ""), doc3 = character(), doc4 = c("cc", NA), doc5 = character()) document_term_matrix(x) dtm <- document_term_matrix(x, vocabulary = c("a", "bb", "cc")) dtm <- dtm_conform(dtm, rows = c("doc1", "doc2", "doc7"), columns = c("a", "bb", "cc")) data(brussels_reviews) x <- strsplit(setNames(brussels_reviews$feedback, brussels_reviews$id), split = " +") x <- document_term_matrix(x) ## ## Example adding bigrams/trigrams to the document term matrix ## Mark that this can also be done using ?dtm_cbind ## library(data.table) x <- as.data.table(brussels_reviews_anno) x <- x[, token_bigram := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)] x <- x[, token_trigram := txt_nextgram(token, n = 3), by = list(doc_id, sentence_id)] x <- document_term_frequencies(x = x, document = "doc_id", term = c("token", "token_bigram", "token_trigram")) dtm <- document_term_matrix(x) ## ## Convert dense matrix to 
sparse matrix ## x <- matrix(c(0, 0, 0, 1, NA, 3, 4, 5, 6, 7), nrow = 2) x dtm <- document_term_matrix(x) dtm x <- matrix(c(0, 0, 0, 0.1, NA, 0.3, 0.4, 0.5, 0.6, 0.7), nrow = 2) x dtm <- document_term_matrix(x) dtm x <- setNames(c(TRUE, NA, FALSE, FALSE), c("a", "b", "c", "d")) x <- as.matrix(x) dtm <- document_term_matrix(x) dtm ## ## Convert vectors to sparse matrices ## x <- setNames(-3:3, c("a", "b", "c", "d", "e", "f")) dtm <- document_term_matrix(x) dtm x <- setNames(runif(6), c("a", "b", "c", "d", "e", "f")) dtm <- document_term_matrix(x) dtm ## ## Convert lists to sparse matrices ## x <- list(a = c("some", "set", "of", "words"), b1 = NA, b2 = NA, c1 = character(), c2 = 0, d = c("words", "words", "words")) dtm <- document_term_matrix(x) dtm
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.