# Copyright 2010-2019 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus. If not, see <http://www.gnu.org/licenses/>.
# this internal function does the real frequency analysis,
# so it's mostly called by freq.analysis()
###############################################################
## if this signature changes, check freq.analysis() as well! ##
###############################################################
## function kRp.freq.analysis.calc()
kRp.freq.analysis.calc <- function(
  txt.file,
  corp.freq=NULL,
  desc.stat=TRUE,
  corp.rm.class="nonpunct",
  corp.rm.tag=c()
){
  lang <- language(txt.file)
  # expand the "nonpunct" shortcut into the actual list of POS classes
  # (punctuation and sentence-ending tags) to be dropped from the analysis
  if(identical(corp.rm.class, "nonpunct")){
    corp.rm.class <- kRp.POS.tags(lang, tags=c("punct","sentc"), list.classes=TRUE)
  } else {}
  # if no corpus frequency object was given explicitly, fall back to one
  # that the text object itself might carry as a feature
  if(all(hasFeature(txt.file, "corp_freq"), is.null(corp.freq))){
    corp.freq <- corpusCorpFreq(txt.file)
  } else {}
  if(!is.null(corp.freq)){
    # before we even start, check if we're alright:
    stopifnot(inherits(corp.freq, "kRp.corp.freq"))
    frequency.pre <- text.freq.analysis(
      txt.commented=taggedText(txt.file),
      corp.freq=corp.freq,
      corp.rm.class=corp.rm.class,
      corp.rm.tag=corp.rm.tag,
      lang=lang
    )
    # commented will be overwritten with a new version containing percentages for each word
    taggedText(txt.file) <- frequency.pre[["commented"]]
    corpusFreq(txt.file) <- frequency.pre[["freq.analysis"]]
  } else {
    corpusFreq(txt.file) <- list(NA)
  }
  if(isTRUE(desc.stat)){
    # BUGFIX: use taggedText(txt.file) here instead of frequency.pre[["commented"]];
    # frequency.pre is undefined when corp.freq is NULL, which previously caused an
    # error for desc.stat=TRUE without a frequency corpus. In the corp.freq branch
    # taggedText(txt.file) was already set to frequency.pre[["commented"]], so the
    # result is unchanged there.
    describe(txt.file) <- text.analysis(
      taggedText(txt.file),
      lang=lang,
      corp.rm.class=corp.rm.class,
      corp.rm.tag=corp.rm.tag,
      desc=describe(txt.file)
    )
  } else {}
  return(txt.file)
} ## end function kRp.freq.analysis.calc()
## function frqcy.summarize()
# Summarizes a numeric vector of (relative) frequency values.
#
# @param pct.data numeric vector of frequency/percentage values, may contain NAs
# @param na.rm whether NAs are removed for sd() and quantile()
# @return a list with three elements:
#   "summary"   - the result of summary() on the input
#   "sd"        - the standard deviation
#   "quantiles" - quantiles in 5% steps from 0% to 100%
frqcy.summarize <- function(pct.data, na.rm=TRUE){
  return(
    list(
      summary=summary(pct.data),
      sd=sd(pct.data, na.rm=na.rm),
      quantiles=quantile(pct.data, probs=seq(0, 1, 0.05), na.rm=na.rm)
    )
  )
} ## end function frqcy.summarize()
## function frqcy.by.rel()
# Computes one relative frequency measure (given by 'rel') for every token
# of a commented text and summarizes it in four variants: over all known
# tokens, over all tokens (NA treated as 0), over tokens after class/tag
# filtering, and over unique rows (types) only.
#
# @param txt.commented data.frame of tagged tokens (needs "token", "wclass", "tag")
# @param corp.freq an object of class kRp.corp.freq to look frequencies up in
# @param corp.rm.class word classes to drop for the filtered summaries
# @param corp.rm.tag POS tags to drop for the filtered summaries
# @param rel name of the frequency measure, passed through to word.freq()
# @return a list with the raw per-token values ("rel.col") and four summaries
frqcy.by.rel <- function(txt.commented, corp.freq, corp.rm.class, corp.rm.tag, rel){
  # to avoid needless NOTEs from R CMD check
  wclass <- tag <- rel.col <- NULL
  # look up percentages for each part of the text
  # call internal function word.freq()
  txt <- as.character(txt.commented[["token"]])
  txt.rel.all <- as.numeric(word.freq(txt, corp.freq=corp.freq, rel=rel, zero.NAs=TRUE))
  # add percent info to the commented text
  txt.commented <- cbind(txt.commented, rel.col=txt.rel.all)
  # now let's do some calculations...
  frqcy.summary <- frqcy.summarize(txt.rel.all)
  # there are probably some NAs in our text. they're the result of words not found
  # in the language corpus, hence we'll compute another summary and assume NA equals
  # a probability of 0
  txt.rel.noNAs <- txt.rel.all
  # BUGFIX: actually replace the NAs with 0, as the comment above states was
  # intended; previously this summary was always identical to 'frqcy.summary'
  txt.rel.noNAs[is.na(txt.rel.noNAs)] <- 0
  frqcy.summary.noNAs <- frqcy.summarize(txt.rel.noNAs)
  # if defined, remove entries of certain classes from the word list
  txt.dropped.classes <- droplevels(subset(txt.commented, !wclass %in% corp.rm.class))
  txt.dropped.classes <- droplevels(subset(txt.dropped.classes, !tag %in% corp.rm.tag))
  txt.rel.sub <- subset(txt.dropped.classes, select = rel.col)$rel.col
  # exclude NAs as well, with probability = 0
  frqcy.summary.excluded <- frqcy.summarize(txt.rel.sub)
  # and finally, analyze only unique words
  txt.types <- unique(txt.dropped.classes)
  txt.rel.types <- subset(txt.types, select = rel.col)$rel.col
  frqcy.summary.types <- frqcy.summarize(txt.rel.types)
  results <- list(
    rel.col=txt.rel.all,
    summary.known=frqcy.summary,
    summary.all=frqcy.summary.noNAs,
    summary.excluded=frqcy.summary.excluded,
    summary.types=frqcy.summary.types
  )
  return(results)
} ## end function frqcy.by.rel()
## function frqcy.of.types()
# Takes a vector of tokens and returns a list with two possible entries,
# both data.frames with two columns: the token/type and its absolute
# frequency in the given vector.
#
# @param tokens character vector of tokens
# @param byTypes if TRUE, include a data.frame with one row per distinct type
# @param byTokens if TRUE, include a data.frame with one row per token,
#   in the original token order
# @return a list with elements "byTypes" and "byTokens"; entries that were
#   not requested remain NA
frqcy.of.types <- function(tokens, byTypes=TRUE, byTokens=TRUE){
  result <- list(byTypes=NA, byTokens=NA)
  # absolute frequency of each distinct type (names are sorted by table())
  type.counts <- table(tokens)
  all.types <- names(type.counts)
  if(isTRUE(byTypes)){
    result[["byTypes"]] <- data.frame(
      type=all.types,
      freq=as.vector(type.counts),
      row.names=all.types,
      stringsAsFactors=FALSE
    )
  }
  if(isTRUE(byTokens)){
    # per-token frequencies are looked up by name to keep the input order
    result[["byTokens"]] <- data.frame(
      token=tokens,
      freq=as.vector(type.counts[tokens]),
      stringsAsFactors=FALSE
    )
  }
  return(result)
} ## end function frqcy.of.types()
## function text.freq.analysis()
# Runs the full per-token frequency analysis on a commented (tagged) text.
# Expects tagged text, commented text and valid corp.freq objects.
#
# @param txt.commented data.frame of tagged tokens
# @param corp.freq an object of class kRp.corp.freq
# @param corp.rm.class word classes to drop for the filtered summaries
# @param corp.rm.tag POS tags to drop for the filtered summaries
# @param lang language identifier, used to look up POS tag classes
# @return a list with two elements:
#   "commented"     - the input with added per-token frequency columns
#   "freq.analysis" - summaries and sentence-level factors
text.freq.analysis <- function(
  txt.commented,
  corp.freq,
  corp.rm.class,
  corp.rm.tag,
  lang
){
  # to avoid needless NOTEs from R CMD check
  wclass <- NULL
  stopifnot(inherits(corp.freq, "kRp.corp.freq"))
  # all relative frequency measures computed per token
  rel.measures <- c("pmio", "log10", "rank.rel.avg", "rank.rel.min", "idf")
  frq.data <- list()
  for (this.rel in rel.measures){
    frq.data[[this.rel]] <- frqcy.by.rel(
      txt.commented,
      corp.freq=corp.freq,
      corp.rm.class=corp.rm.class,
      corp.rm.tag=corp.rm.tag,
      rel=this.rel
    )
  }
  # absolute frequency of each token within this document (term frequency)
  type.freq <- frqcy.of.types(tokens=txt.commented[["token"]], byTypes=FALSE, byTokens=TRUE)
  term.freq <- type.freq[["byTokens"]][["freq"]]
  idf.col <- frq.data[["idf"]][["rel.col"]]
  # append one column per measure to the commented text
  txt.commented[["pmio"]] <- frq.data[["pmio"]][["rel.col"]]
  txt.commented[["log10"]] <- frq.data[["log10"]][["rel.col"]]
  txt.commented[["rank.avg"]] <- frq.data[["rank.rel.avg"]][["rel.col"]]
  txt.commented[["rank.min"]] <- frq.data[["rank.rel.min"]][["rel.col"]]
  txt.commented[["tf"]] <- term.freq
  txt.commented[["idf"]] <- idf.col
  txt.commented[["tfidf"]] <- term.freq * idf.col
  # information on words-per-sentence and commas-per-sentence
  sentc.classes <- kRp.POS.tags(lang, tags="sentc", list.classes=TRUE)
  nonword.classes <- kRp.POS.tags(lang, tags=c("punct","sentc"), list.classes=TRUE)
  num.sentences <- sum(txt.commented[["wclass"]] %in% sentc.classes)
  num.commas <- sum(txt.commented[["wclass"]] %in% "comma")
  num.words <- sum(!txt.commented[["wclass"]] %in% nonword.classes)
  res.sentences <- data.frame(
    words.p.sntc=num.words/num.sentences,
    comma.p.sntc=num.commas/num.sentences,
    words.p.comma=num.words/num.commas
  )
  # drop the raw per-token column and the known-words-only summary
  # (list elements 1 and 2) from what is stored as the analysis result
  freq.analysis <- list(
    frq.pmio=frq.data[["pmio"]][-c(1,2)],
    frq.log10=frq.data[["log10"]][-c(1,2)],
    frq.rank.avg=frq.data[["rank.rel.avg"]][-c(1,2)],
    frq.rank.min=frq.data[["rank.rel.min"]][-c(1,2)],
    sentence.factors=res.sentences
  )
  return(list(commented=txt.commented, freq.analysis=freq.analysis))
} ## end function text.freq.analysis()