# R/01_class_03_kRp.corp.freq.R

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.

#' S4 Class kRp.corp.freq
#'
#' This class is used for objects that are returned by \code{\link[koRpus:read.corp.LCC]{read.corp.LCC}} and \code{\link[koRpus:read.corp.celex]{read.corp.celex}}.
#'
#' The slot \code{meta} simply contains all information from the "meta.txt" of the LCC[1] data and remains
#' empty for data from a Celex[2] DB.
#'
#' @section Constructor function:
#' Should you need to manually generate objects of this class (which should rarely be the case), the constructor function 
#' \code{kRp_corp_freq(...)} can be used instead of
#' \code{new("kRp.corp.freq", ...)}.
#'
#' @slot meta Metadata on the corpora (see details).
#' @slot words Absolute word frequencies. It has at least the following columns:
#'    \describe{
#'      \item{\code{num}:}{Some word ID from the DB, integer}
#'      \item{\code{word}:}{The word itself}
#'      \item{\code{lemma}:}{The lemma of the word}
#'      \item{\code{tag}:}{A part-of-speech tag}
#'      \item{\code{wclass}:}{The word class}
#'      \item{\code{lttr}:}{The number of characters}
#'      \item{\code{freq}:}{The frequency of that word in the corpus DB}
#'      \item{\code{pct}:}{Percentage of appearance in DB}
#'      \item{\code{pmio}:}{Appearance per million words in DB}
#'      \item{\code{log10}:}{Base 10 logarithm of word frequency}
#'      \item{\code{rank.avg}:}{Rank in corpus data, \code{\link{rank}} ties method "average"}
#'      \item{\code{rank.min}:}{Rank in corpus data, \code{\link{rank}} ties method "min"}
#'      \item{\code{rank.rel.avg}:}{Relative rank, i.e. percentile of \code{"rank.avg"}}
#'      \item{\code{rank.rel.min}:}{Relative rank, i.e. percentile of \code{"rank.min"}}
#'      \item{\code{inDocs}:}{The absolute number of documents in the corpus containing the word}
#'      \item{\code{idf}:}{The inverse document frequency}
#'    }
#'    The slot might have additional columns, depending on the input material.
#' @slot desc Descriptive information. It contains six numbers from the \code{meta} information, for convenient accessibility:
#'    \describe{
#'      \item{\code{tokens}:}{Number of running word forms}
#'      \item{\code{types}:}{Number of distinct word forms}
#'      \item{\code{words.p.sntc}:}{Average sentence length in words}
#'      \item{\code{chars.p.sntc}:}{Average sentence length in characters}
#'      \item{\code{chars.p.wform}:}{Average word form length}
#'      \item{\code{chars.p.word}:}{Average running word length}
#'    }
#'    The slot might have additional columns, depending on the input material.
#' @slot bigrams A data.frame listing all tokens that co-occurred next to each other in the corpus:
#'    \describe{
#'      \item{\code{token1}:}{The first token}
#'      \item{\code{token2}:}{The second token that appeared right next to the first}
#'      \item{\code{freq}:}{How often the co-occurrence was present}
#'      \item{\code{sig}:}{Log-likelihood significance of the co-occurrence}
#'    }
#' @slot cooccur Similar to \code{bigrams}, but listing co-occurrences anywhere in one sentence:
#'    \describe{
#'      \item{\code{token1}:}{The first token}
#'      \item{\code{token2}:}{The second token that appeared in the same sentence}
#'      \item{\code{freq}:}{How often the co-occurrence was present}
#'      \item{\code{sig}:}{Log-likelihood significance of the co-occurrence}
#'    }
#' @slot caseSens A single logical value, whether the frequency statistics were calculated case sensitive
#'    or not.
#' @name kRp.corp.freq,-class
#' @aliases kRp.corp.freq-class
#' @import methods
#' @keywords classes
# @author m.eik michalke \email{meik.michalke@@hhu.de}
#' @references
#' [1] \url{https://wortschatz.uni-leipzig.de/en/download/}
#' [2] \url{http://celex.mpi.nl}
#' @export kRp_corp_freq
#' @exportClass kRp.corp.freq
#' @rdname kRp.corp.freq-class

kRp_corp_freq <- setClass(
  Class="kRp.corp.freq",
  # five data.frame slots plus one logical flag; the class itself declares
  # only the slot types, the prototype is deliberately left empty
  slots=c(
    meta="data.frame",
    words="data.frame",
    desc="data.frame",
    bigrams="data.frame",
    cooccur="data.frame",
    caseSens="logical"
  ),
  prototype=prototype()
)

# "initialize" method for class kRp.corp.freq.
#
# Each slot is set to the supplied value or, if none is given, to an empty
# data.frame that already carries the expected column names. The finished
# object is checked with validObject() before it is returned.
#
# Arguments:
#   .Object   the object to initialize
#   meta      data.frame with the columns "meta" and "value"
#   words     data.frame of word frequency data
#   desc      data.frame of descriptive numbers; all six default columns are
#             numeric, in line with the class documentation ("six numbers")
#   bigrams   data.frame with the columns "token1", "token2", "freq", "sig"
#   cooccur   data.frame with the columns "token1", "token2", "freq", "sig"
#   caseSens  logical, defaults to FALSE
#
# Returns the initialized object; validObject() signals an error otherwise.
setMethod("initialize", "kRp.corp.freq",
  function(
    .Object,
    meta=data.frame(
        meta=character(),
        value=character()
      ),
    words=data.frame(
        num=numeric(),
        word=character(),
        lemma=character(),
        tag=character(),
        wclass=character(),
        lttr=numeric(),
        freq=numeric(),
        pct=numeric(),
        pmio=numeric(),
        log10=numeric(),
        rank.avg=numeric(),
        rank.min=numeric(),
        rank.rel.avg=numeric(),
        rank.rel.min=numeric(),
        inDocs=numeric(),
        idf=numeric()
      ),
    desc=data.frame(
        # token and type counts are numbers, like the remaining columns
        tokens=numeric(),
        types=numeric(),
        words.p.sntc=numeric(),
        chars.p.sntc=numeric(),
        chars.p.wform=numeric(),
        chars.p.word=numeric()
      ),
    bigrams=data.frame(
        token1=character(),
        token2=character(),
        freq=numeric(),
        sig=numeric()
      ),
    cooccur=data.frame(
        token1=character(),
        token2=character(),
        freq=numeric(),
        sig=numeric()
      ),
    caseSens=FALSE
  ){
    slot(.Object, "meta") <- meta
    slot(.Object, "words") <- words
    slot(.Object, "desc") <- desc
    slot(.Object, "bigrams") <- bigrams
    slot(.Object, "cooccur") <- cooccur
    slot(.Object, "caseSens") <- caseSens
    # fail early if any slot does not pass the class validity check
    validObject(.Object)
    return(.Object)
  }
)

# Validity check for objects of class kRp.corp.freq.
#
# Slots "meta", "bigrams" and "cooccur" must have exactly the expected column
# names, in the expected order. Slots "words" and "desc" are documented to
# possibly carry additional columns, so for these it is only required that all
# mandatory columns are present; extra columns and a different column order
# are accepted.
#
# Signals an error naming the first offending slot, otherwise returns TRUE.
setValidity("kRp.corp.freq", function(object){
    meta.names <- colnames(slot(object, "meta"))
    words.names <- colnames(slot(object, "words"))
    desc.names <- colnames(slot(object, "desc"))
    bigrams.names <- colnames(slot(object, "bigrams"))
    cooccur.names <- colnames(slot(object, "cooccur"))

    # mandatory columns of the slots that may be extended by further columns
    words.required <- c(
      "num", "word", "lemma", "tag", "wclass", "lttr", "freq", "pct", "pmio", "log10",
      "rank.avg", "rank.min", "rank.rel.avg", "rank.rel.min", "inDocs", "idf"
    )
    desc.required <- c(
      "tokens", "types", "words.p.sntc", "chars.p.sntc", "chars.p.wform", "chars.p.word"
    )

  if(!identical(meta.names, c("meta", "value"))){
    stop(simpleError("Invalid object: Wrong column names in slot \"meta\"."))
  } else {}
  # superset check: additional columns are explicitly allowed here
  if(!all(words.required %in% words.names)){
    stop(simpleError("Invalid object: Wrong column names in slot \"words\"."))
  } else {}
  # superset check: additional columns are explicitly allowed here
  if(!all(desc.required %in% desc.names)){
    stop(simpleError("Invalid object: Wrong column names in slot \"desc\"."))
  } else {}
  if(!identical(bigrams.names, c("token1", "token2", "freq", "sig"))){
    stop(simpleError("Invalid object: Wrong column names in slot \"bigrams\"."))
  } else {}
  if(!identical(cooccur.names, c("token1", "token2", "freq", "sig"))){
    stop(simpleError("Invalid object: Wrong column names in slot \"cooccur\"."))
  } else {}
  
  return(TRUE)
})

# Try the koRpus package in your browser
#
# Any scripts or data that you put into this service are public.
#
# koRpus documentation built on May 18, 2021, 1:13 a.m.