#' Provides basic frequency stats for an annotated corpus.
#'
#' Summarizes token frequency, type frequency and number of sentences per
#' text, for the corpus as a whole and (optionally) per genre. Rows whose
#' part-of-speech tag equals `"PUNCT"` are excluded from every count.
#'
#' @name clr_desc_corpus
#' @param corp An annotated corpus: a data frame, a list of data frames
#'   (row-bound with `rbindlist()`), or a list with elements `corpus` and
#'   `meta`.
#' @param doc Name of the document id column (a single string).
#' @param sent Name of the sentence id column.
#' @param tok Name of the column whose distinct values are counted as types.
#' @param upos Name of the part-of-speech column. If the corpus has no column
#'   of that name, no rows are filtered out.
#' @param genre Optional name(s) of the column(s) to group by after joining
#'   the corpus with `corp$meta`. The join uses `doc_id` when both tables have
#'   that column and falls back to `doc` otherwise.
#' @return A named list of data.tables: `text` (one row per document, with
#'   `textLength`, `textType`, `textSent`), `corpus` (a single row that also
#'   has `n_docs`) and, when `genre` is supplied, `genre` (one row per group).
#' @import data.table
#'
#'
#' @export
#' @rdname clr_desc_corpus
clr_desc_corpus <- function(corp,
                            doc = "id",
                            sent = "sid",
                            tok = "word",
                            upos = "upos",
                            genre = NULL) {
  x <- corp
  if ("meta" %in% names(x)) x <- x[["corpus"]]
  if (!is.data.frame(x)) x <- rbindlist(x)
  x <- as.data.table(x)

  absent <- setdiff(c(doc, sent, tok), names(x))
  if (length(absent) > 0L) {
    stop("Column(s) not found in corpus: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  # Resolve the tag column through the `upos` argument, outside data.table's
  # column scope: inside `[.data.table` the symbol `upos` would name the column
  # itself, so the argument would be ignored. `which()` drops NA tags, matching
  # how a logical `i` treats NA.
  if (isTRUE(upos %in% names(x))) {
    keep <- which(x[[upos]] != "PUNCT")
    x <- x[keep]
  }

  # Copy the needed columns into a narrow table under fixed internal names, so
  # a corpus column that happens to be called `doc`, `tok`, `sent`, ... cannot
  # shadow the function arguments during the aggregations below.
  pick <- function(d, cols, keys) {
    picked <- lapply(cols, function(col) d[[col]])
    names(picked) <- keys
    as.data.table(picked)
  }
  core_cols <- c(doc, sent, tok)
  core_keys <- c("doc_key", "sent_key", "tok_key")

  slim <- pick(x, core_cols, core_keys)

  by_text <- slim[, list(textLength = .N,
                         textType = uniqueN(tok_key),
                         textSent = uniqueN(sent_key)),
                  by = "doc_key"]
  setnames(by_text, "doc_key", doc)
  # Order documents by the numeric part of their id (letters stripped).
  doc_order <- order(as.numeric(gsub("[A-Za-z]+", "", by_text[[doc]])))
  by_text <- by_text[doc_order]

  # Sentences are counted as distinct (document, sentence) pairs; pasting the
  # two ids together without a separator can merge different pairs.
  corpus_stats <- slim[, list(n_docs = uniqueN(doc_key),
                              textLength = .N,
                              textType = uniqueN(tok_key),
                              textSent = uniqueN(data.table(doc_key, sent_key)))]

  out <- list("text" = by_text, "corpus" = corpus_stats)
  if (is.null(genre)) {
    return(out)
  }

  meta <- corp[["meta"]]
  if (is.null(meta)) {
    stop("`genre` was supplied but `corp` has no `meta` element to join with.",
         call. = FALSE)
  }
  # as.data.table() instead of setDT(): do not convert the caller's metadata
  # by reference.
  meta <- as.data.table(meta)

  join_key <- if ("doc_id" %in% names(x) && "doc_id" %in% names(meta)) {
    "doc_id"
  } else {
    doc
  }
  if (!(join_key %in% names(meta))) {
    stop("Cannot join corpus and meta: column `", join_key,
         "` is not present in `corp$meta`.", call. = FALSE)
  }
  joined <- x[meta, on = join_key, nomatch = 0]

  absent_genre <- setdiff(genre, names(joined))
  if (length(absent_genre) > 0L) {
    stop("Genre column(s) not found after joining meta: ",
         paste(absent_genre, collapse = ", "), call. = FALSE)
  }

  genre_keys <- paste0("genre_key", seq_along(genre))
  slim_genre <- pick(joined, c(genre, core_cols), c(genre_keys, core_keys))

  by_genre <- slim_genre[, list(n_docs = uniqueN(doc_key),
                                textLength = .N,
                                textType = uniqueN(tok_key),
                                textSent = uniqueN(data.table(doc_key, sent_key))),
                         by = genre_keys]
  setnames(by_genre, genre_keys, genre)

  out[["genre"]] <- by_genre
  out
}
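# NOTE: the two lines below are web-page boilerplate left over from the site
# this file was copied from; they are not part of the package source.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.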