# Copyright 2010-2019 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus. If not, see <http://www.gnu.org/licenses/>.
# this internal function does the real frequency analysis,
# so it's mostly called by freq.analysis()
###############################################################
## if this signature changes, check freq.analysis() as well! ##
###############################################################
## function kRp.freq.analysis.calc()
kRp.freq.analysis.calc <- function(
  txt.file,
  corp.freq=NULL,
  desc.stat=TRUE,
  corp.rm.class="nonpunct",
  corp.rm.tag=c()
){
  lang <- language(txt.file)
  # expand the "nonpunct" shortcut into the actual list of POS classes
  # (punctuation and sentence-ending tags) to be dropped from the analysis
  if(identical(corp.rm.class, "nonpunct")){
    corp.rm.class <- kRp.POS.tags(lang, tags=c("punct","sentc"), list.classes=TRUE)
  } else {}
  # if no corpus frequency object was given explicitly, fall back to one
  # that the text object itself might carry as a feature
  if(all(hasFeature(txt.file, "corp_freq"), is.null(corp.freq))){
    corp.freq <- corpusCorpFreq(txt.file)
  } else {}
  if(!is.null(corp.freq)){
    # before we even start, check if we're alright:
    stopifnot(inherits(corp.freq, "kRp.corp.freq"))
    frequency.pre <- text.freq.analysis(
      txt.commented=taggedText(txt.file),
      corp.freq=corp.freq,
      corp.rm.class=corp.rm.class,
      corp.rm.tag=corp.rm.tag,
      lang=lang
    )
    # commented will be overwritten with a new version containing percentages for each word
    taggedText(txt.file) <- frequency.pre[["commented"]]
    corpusFreq(txt.file) <- frequency.pre[["freq.analysis"]]
  } else {
    corpusFreq(txt.file) <- list(NA)
  }
  if(isTRUE(desc.stat)){
    # BUGFIX: use taggedText(txt.file) here instead of frequency.pre[["commented"]];
    # frequency.pre is undefined when corp.freq is NULL, which previously caused an
    # error for desc.stat=TRUE without a frequency corpus. In the corp.freq branch
    # taggedText(txt.file) was already set to frequency.pre[["commented"]], so the
    # result is unchanged there.
    describe(txt.file) <- text.analysis(
      taggedText(txt.file),
      lang=lang,
      corp.rm.class=corp.rm.class,
      corp.rm.tag=corp.rm.tag,
      desc=describe(txt.file)
    )
  } else {}
  return(txt.file)
} ## end function kRp.freq.analysis.calc()
## function frqcy.summarize()
# Summarizes a numeric vector of (relative) frequency values.
#
# @param pct.data numeric vector of frequency/percentage values, may contain NAs
# @param na.rm whether NAs are removed for sd() and quantile()
# @return a list with three elements:
#   "summary"   - the result of summary() on the input
#   "sd"        - the standard deviation
#   "quantiles" - quantiles in 5% steps from 0% to 100%
frqcy.summarize <- function(pct.data, na.rm=TRUE){
  return(
    list(
      summary=summary(pct.data),
      sd=sd(pct.data, na.rm=na.rm),
      quantiles=quantile(pct.data, probs=seq(0, 1, 0.05), na.rm=na.rm)
    )
  )
} ## end function frqcy.summarize()
## function frqcy.by.rel()
# Computes one relative frequency measure (given by 'rel') for every token
# of a commented text and summarizes it in four variants: over all known
# tokens, over all tokens (NA treated as 0), over tokens after class/tag
# filtering, and over unique rows (types) only.
#
# @param txt.commented data.frame of tagged tokens (needs "token", "wclass", "tag")
# @param corp.freq an object of class kRp.corp.freq to look frequencies up in
# @param corp.rm.class word classes to drop for the filtered summaries
# @param corp.rm.tag POS tags to drop for the filtered summaries
# @param rel name of the frequency measure, passed through to word.freq()
# @return a list with the raw per-token values ("rel.col") and four summaries
frqcy.by.rel <- function(txt.commented, corp.freq, corp.rm.class, corp.rm.tag, rel){
  # to avoid needless NOTEs from R CMD check
  wclass <- tag <- rel.col <- NULL
  # look up percentages for each part of the text
  # call internal function word.freq()
  txt <- as.character(txt.commented[["token"]])
  txt.rel.all <- as.numeric(word.freq(txt, corp.freq=corp.freq, rel=rel, zero.NAs=TRUE))
  # add percent info to the commented text
  txt.commented <- cbind(txt.commented, rel.col=txt.rel.all)
  # now let's do some calculations...
  frqcy.summary <- frqcy.summarize(txt.rel.all)
  # there are probably some NAs in our text. they're the result of words not found
  # in the language corpus, hence we'll compute another summary and assume NA equals
  # a probability of 0
  txt.rel.noNAs <- txt.rel.all
  # BUGFIX: actually replace the NAs with 0, as the comment above states was
  # intended; previously this summary was always identical to 'frqcy.summary'
  txt.rel.noNAs[is.na(txt.rel.noNAs)] <- 0
  frqcy.summary.noNAs <- frqcy.summarize(txt.rel.noNAs)
  # if defined, remove entries of certain classes from the word list
  txt.dropped.classes <- droplevels(subset(txt.commented, !wclass %in% corp.rm.class))
  txt.dropped.classes <- droplevels(subset(txt.dropped.classes, !tag %in% corp.rm.tag))
  txt.rel.sub <- subset(txt.dropped.classes, select = rel.col)$rel.col
  # exclude NAs as well, with probability = 0
  frqcy.summary.excluded <- frqcy.summarize(txt.rel.sub)
  # and finally, analyze only unique words
  txt.types <- unique(txt.dropped.classes)
  txt.rel.types <- subset(txt.types, select = rel.col)$rel.col
  frqcy.summary.types <- frqcy.summarize(txt.rel.types)
  results <- list(
    rel.col=txt.rel.all,
    summary.known=frqcy.summary,
    summary.all=frqcy.summary.noNAs,
    summary.excluded=frqcy.summary.excluded,
    summary.types=frqcy.summary.types
  )
  return(results)
} ## end function frqcy.by.rel()
## function frqcy.of.types()
# Takes a vector of tokens and returns a list with two possible entries,
# both data.frames with two columns: the token/type and its absolute
# frequency in the given vector.
#
# @param tokens character vector of tokens
# @param byTypes if TRUE, include a data.frame with one row per distinct type
# @param byTokens if TRUE, include a data.frame with one row per token,
#   in the original token order
# @return a list with elements "byTypes" and "byTokens"; entries that were
#   not requested remain NA
frqcy.of.types <- function(tokens, byTypes=TRUE, byTokens=TRUE){
  result <- list(byTypes=NA, byTokens=NA)
  # absolute frequency of each distinct type (names are sorted by table())
  type.counts <- table(tokens)
  all.types <- names(type.counts)
  if(isTRUE(byTypes)){
    result[["byTypes"]] <- data.frame(
      type=all.types,
      freq=as.vector(type.counts),
      row.names=all.types,
      stringsAsFactors=FALSE
    )
  }
  if(isTRUE(byTokens)){
    # per-token frequencies are looked up by name to keep the input order
    result[["byTokens"]] <- data.frame(
      token=tokens,
      freq=as.vector(type.counts[tokens]),
      stringsAsFactors=FALSE
    )
  }
  return(result)
} ## end function frqcy.of.types()
## function text.freq.analysis()
# Runs the full per-token frequency analysis on a commented (tagged) text.
# Expects tagged text, commented text and valid corp.freq objects.
#
# @param txt.commented data.frame of tagged tokens
# @param corp.freq an object of class kRp.corp.freq
# @param corp.rm.class word classes to drop for the filtered summaries
# @param corp.rm.tag POS tags to drop for the filtered summaries
# @param lang language identifier, used to look up POS tag classes
# @return a list with two elements:
#   "commented"     - the input with added per-token frequency columns
#   "freq.analysis" - summaries and sentence-level factors
text.freq.analysis <- function(
  txt.commented,
  corp.freq,
  corp.rm.class,
  corp.rm.tag,
  lang
){
  # to avoid needless NOTEs from R CMD check
  wclass <- NULL
  stopifnot(inherits(corp.freq, "kRp.corp.freq"))
  # all relative frequency measures computed per token
  rel.measures <- c("pmio", "log10", "rank.rel.avg", "rank.rel.min", "idf")
  frq.data <- list()
  for (this.rel in rel.measures){
    frq.data[[this.rel]] <- frqcy.by.rel(
      txt.commented,
      corp.freq=corp.freq,
      corp.rm.class=corp.rm.class,
      corp.rm.tag=corp.rm.tag,
      rel=this.rel
    )
  }
  # absolute frequency of each token within this document (term frequency)
  type.freq <- frqcy.of.types(tokens=txt.commented[["token"]], byTypes=FALSE, byTokens=TRUE)
  term.freq <- type.freq[["byTokens"]][["freq"]]
  idf.col <- frq.data[["idf"]][["rel.col"]]
  # append one column per measure to the commented text
  txt.commented[["pmio"]] <- frq.data[["pmio"]][["rel.col"]]
  txt.commented[["log10"]] <- frq.data[["log10"]][["rel.col"]]
  txt.commented[["rank.avg"]] <- frq.data[["rank.rel.avg"]][["rel.col"]]
  txt.commented[["rank.min"]] <- frq.data[["rank.rel.min"]][["rel.col"]]
  txt.commented[["tf"]] <- term.freq
  txt.commented[["idf"]] <- idf.col
  txt.commented[["tfidf"]] <- term.freq * idf.col
  # information on words-per-sentence and commas-per-sentence
  sentc.classes <- kRp.POS.tags(lang, tags="sentc", list.classes=TRUE)
  nonword.classes <- kRp.POS.tags(lang, tags=c("punct","sentc"), list.classes=TRUE)
  num.sentences <- sum(txt.commented[["wclass"]] %in% sentc.classes)
  num.commas <- sum(txt.commented[["wclass"]] %in% "comma")
  num.words <- sum(!txt.commented[["wclass"]] %in% nonword.classes)
  res.sentences <- data.frame(
    words.p.sntc=num.words/num.sentences,
    comma.p.sntc=num.commas/num.sentences,
    words.p.comma=num.words/num.commas
  )
  # drop the raw per-token column and the known-words-only summary
  # (list elements 1 and 2) from what is stored as the analysis result
  freq.analysis <- list(
    frq.pmio=frq.data[["pmio"]][-c(1,2)],
    frq.log10=frq.data[["log10"]][-c(1,2)],
    frq.rank.avg=frq.data[["rank.rel.avg"]][-c(1,2)],
    frq.rank.min=frq.data[["rank.rel.min"]][-c(1,2)],
    sentence.factors=res.sentences
  )
  return(list(commented=txt.commented, freq.analysis=freq.analysis))
} ## end function text.freq.analysis()