Nothing
###
# Corpus Readers
###
#' Read in a corpus file.
#'
#' Converts pre-processed document matrices stored in popular formats to stm
#' format.
#'
#' This function provides a simple utility for converting other document
#' formats to our own. Briefly- \code{dtm} takes as input a standard matrix
#' and converts to our format. \code{slam} converts from the
#' \code{simple_triplet_matrix} representation used by the \code{slam} package.
#' This is also the representation of corpora in the popular \code{tm} package
#' and should work in those cases.
#'
#' \code{dtm} expects a matrix object where each row represents a document and
#' each column represents a word in the dictionary.
#'
#' \code{slam} expects a \code{\link[slam]{simple_triplet_matrix}} from that
#' package.
#'
#' \code{Matrix} attempts to coerce the matrix to a
#' \code{\link[slam]{simple_triplet_matrix}} and convert using the
#' functionality built for the \code{slam} package. This will work for most
#' applicable classes in the \code{Matrix} package such as \code{dgCMatrix}.
#'
#' If you are trying to read a \code{.ldac} file see \code{\link{readLdac}}.
#'
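#' @param corpus The document-term matrix object to be converted: a standard
#' matrix, a \code{simple_triplet_matrix} or a \code{Matrix} package object,
#' as indicated by \code{type}.
#' @param type The format of \code{corpus}: one of \code{"dtm"},
#' \code{"slam"} or \code{"Matrix"}. See details.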
#' @return \item{documents}{A documents object in our format} \item{vocab}{A
#' vocab object if information is available to construct one}
#' @seealso \code{\link{textProcessor}}, \code{\link{prepDocuments}}, \code{\link{readLdac}}
#' @examples
#'
#' \dontrun{
#'
#' library(textir)
#' data(congress109)
#' out <- readCorpus(congress109Counts, type="Matrix")
#' documents <- out$documents
#' vocab <- out$vocab
#' }
#' @export
readCorpus <- function(corpus, type=c("dtm", "slam", "Matrix")) {
  # Resolve `type` to exactly one of the supported choices; with no explicit
  # value the first choice ("dtm") is used.
  type <- match.arg(type)

  # Dense document-term matrices have their own converter.
  if (type == "dtm") {
    return(read.dtm(corpus))
  }

  # Matrix-package objects are first coerced to a simple_triplet_matrix, after
  # which they share the conversion path used for slam input.
  if (type == "Matrix") {
    corpus <- slam::as.simple_triplet_matrix(corpus)
  }
  read.slam(corpus)
}
#' Read in a .ldac Formatted File
#'
#' Read in a term document matrix in the .ldac sparse matrix format popularized
#' by David Blei's C code implementation of lda.
#'
#' \code{ldac} expects a file name or path that contains a file in Blei's LDA-C
#' format. From his ReadMe: "The data is a file where each line is of the form:
#'
#' [M] [term_1]:[count] [term_2]:[count] ... [term_N]:[count]
#'
#' where [M] is the number of unique terms in the document, and the [count]
#' associated with each term is how many times that term appeared in the
#' document. Note that [term_1] is an integer which indexes the term; it is
#' not a string."
#'
#' Because R indexes from one, the values of the term indices are incremented
#' by one on import.
#'
#' @param filename An input file or filepath to be processed
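#' @return A list of documents in our format: one two-row matrix per document,
#' with the (one-based) term indices in the first row and the corresponding
#' counts in the second row.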
#' @seealso \code{\link{textProcessor}}, \code{\link{prepDocuments}}, \code{\link{readCorpus}}
#' @export
readLdac <- function(filename) {
  # Read the .ldac format.
  # Based on Jonathan Chang's reader with addition of zero correction.
  #
  # Returns a list with one two-row integer matrix per line of the file:
  # row 1 holds the term indices shifted to be one-based, row 2 the counts.

  # One character string per line of the file.
  lines <- scan(filename, what = "character", sep = "\n")

  # "[M] t1:c1 t2:c2 ..." becomes c("M", "t1", "c1", "t2", "c2", ...).
  fields <- strsplit(chartr(":", " ", lines), " ", fixed = TRUE)

  # lapply (rather than mapply, which simplifies by default) guarantees a list
  # even when every document has the same number of unique terms or the file
  # holds a single document.
  lapply(fields, function(x) {
    # Drop the leading [M]; the remaining values alternate term, count, so
    # filling a two-row matrix column-wise puts terms in row 1, counts in row 2.
    pairs <- matrix(as.integer(x[-1]), nrow = 2)
    # Zero correction: the file's term indices start at 0, R's at 1. Adding
    # 1L keeps the result integer.
    rbind(pairs[1, ] + 1L, pairs[2, ])
  })
}
# Convert a standard (dense) document-term matrix to list format.
#
# dtm: an object coercible with as.matrix() in which each row is a document
#      and each column a term; the column names, if any, become the vocab.
#
# Returns list(documents = , vocab = ) where documents is an unnamed list
# holding one two-row integer matrix per row of dtm: row 1 contains the column
# indices of the positive cells, row 2 the corresponding counts.
read.dtm <- function(dtm) {
  # Test for and adjust for misspecification: a simple_triplet_matrix belongs
  # to the slam reader, so warn, delegate and return that result instead of
  # discarding it and falling through to the dense conversion.
  if (inherits(dtm, "simple_triplet_matrix")) {
    warning("Please use the slam option. dtm is for dense matrices.")
    return(read.slam(dtm))
  }

  dtm.mat <- as.matrix(dtm)
  vocab <- colnames(dtm)

  # Splitting by row index always produces a list with one element per
  # document. apply() is deliberately avoided: it collapses its result into a
  # single matrix whenever every row has the same number of positive cells
  # (which is always the case for a matrix without zeros).
  documents <- lapply(split(dtm.mat, row(dtm.mat)), function(y) {
    nz <- which(y > 0)
    # deparse.level = 0 keeps the result free of row labels.
    rbind(nz, as.integer(y[nz]), deparse.level = 0)
  })
  # Drop the labels generated by split() so documents are unnamed.
  names(documents) <- NULL

  list(documents = documents, vocab = vocab)
}
# Convert a simple triplet matrix to list format.
#
# corpus: a simple_triplet_matrix. Objects that also inherit from
#         "TermDocumentMatrix" are treated as having documents in columns;
#         anything else is treated as having documents in rows.
#
# Returns list(documents = , vocab = ). Documents whose counts sum to zero are
# dropped before conversion. The per-document structure is whatever
# ijv.to.doc() builds from the (document, term, count) triplets.
read.slam <- function(corpus) {
  if (!inherits(corpus, "simple_triplet_matrix")) {
    stop("corpus is not a simple triplet matrix")
  }

  if (inherits(corpus, "TermDocumentMatrix")) {
    # Documents are columns: j indexes documents and i indexes terms.
    keep <- which(slam::col_sums(corpus) != 0)
    kept <- corpus[, keep]
    documents <- ijv.to.doc(kept$j, kept$i, kept$v)
  } else {
    # Documents are rows: i indexes documents and j indexes terms.
    keep <- which(slam::row_sums(corpus) != 0)
    kept <- corpus[keep, ]
    documents <- ijv.to.doc(kept$i, kept$j, kept$v)
  }

  # Label the documents with the names of the ones that were kept.
  names(documents) <- kept$dimnames$Docs
  vocab <- corpus$dimnames$Terms
  list(documents = documents, vocab = vocab)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.