koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.

init.kRp.text.df <- function(rows=1){
  val <- rep(NA, rows)
  return(data.frame(
    doc_id=factor(val),
    token=val,
    tag=factor(val),
    lemma=val,
    lttr=val,
    wclass=factor(val),
    desc=factor(val),
    stop=val,
    stem=val,
    idx=val,
    sntc=val
  ))
}

valid.tokens.kRp.text <- colnames(init.kRp.text.df())

#' S4 Class kRp.text
#'
#' This class is used for objects that are returned by \code{\link[koRpus:treetag]{treetag}} or \code{\link[koRpus:tokenize]{tokenize}}.
#'
#' @section Contructor function:
#' Should you need to manually generate objects of this class (which should rarely be the case), the contructor function 
#' \code{kRp_text(...)} can be used instead of
#' \code{new("kRp.text", ...)}.
#'
#' @slot lang A character string, naming the language that is assumed for the tokenized text in this object.
#' @slot desc Descriptive statistics of the tagged text.
#' @slot tokens Results of the called tokenizer and POS tagger. The data.frame usually has eleven columns:
#'    \describe{
#'      \item{\code{doc_id}:}{Factor, optional document identifier.}
#'      \item{\code{token}:}{Character, the tokenized text.}
#'      \item{\code{tag}:}{Factor, POS tags for each token.}
#'      \item{\code{lemma}:}{Character, lemma for each token.}
#'      \item{\code{lttr}:}{Integer, number of letters.}
#'      \item{\code{wclass}:}{Factor, word class.}
#'      \item{\code{desc}:}{Factor, a short description of the POS tag.}
#'      \item{\code{stop}:}{Logical, \code{TRUE} if token is a stopword.}
#'      \item{\code{stem}:}{Character, stemmed token.}
#'      \item{\code{idx}:}{Integer, index number of token in this document.}
#'      \item{\code{sntc}:}{Integer, number of sentence in this document.}
#'    }
#'    This data.frame structure adheres to the "Text Interchange Formats" guidelines set out by rOpenSci[1].
#' @slot features A named logical vector, indicating which features are available in this object's \code{feat_list} slot.
#'    Common features are listed in the description of the \code{feat_list} slot.
#' @slot feat_list A named list with optional analysis results or other content as used by the defined \code{features}:
#'    \itemize{
#'      \item{\code{hyphen} }{A named list of objects of class \code{\link[sylly:kRp.hyphen-class]{kRp.hyphen}}.}
#'      \item{\code{readability} }{A named list of objects of class \code{\link[koRpus:kRp.readability-class]{kRp.readability}}.}
#'      \item{\code{lex_div} }{A named list of objects of class \code{\link[koRpus:kRp.TTR-class]{kRp.TTR}}.}
#'      \item{\code{freq} }{A list with additional results of \code{\link[koRpus:freq.analysis]{freq.analysis}}.}
#'      \item{\code{corp_freq} }{An object of class \code{\link[koRpus:kRp.corp.freq-class]{kRp.corp.freq}}, e.g., results of a call to
#'        \code{\link[koRpus:read.corp.custom]{read.corp.custom}}.}
#'      \item{\code{diff} }{Additional results of calls to a method like \code{\link[koRpus:textTransform]{textTransform}}.}
#      \item{\code{summary} }{A summary data frame for the full corpus, including descriptive statistics on all texts, as well as
#        results of analyses like readability and lexical diversity, if available.}
#'      \item{\code{doc_term_matrix} }{A sparse document-term matrix, as produced by \code{\link[koRpus:docTermMatrix]{docTermMatrix}}.}
#      \item{\code{stopwords} }{A numeric vector with the total number of stopwords in each text, if stopwords were analyzed during tokenizing or POS tagging.}
#      \item{\code{} }{}
#'    }
#'    See the \code{\link[koRpus:kRp.text_get-methods]{getter and setter methods}} for easy access to these sub-slots.
#'    There can actually be any number of additional features, the above is just a list of those already defined by this package.
#' @note There is also \code{as()} methods to transform objects from other koRpus classes into kRp.text.
#' @name kRp.text,-class
#' @aliases kRp.text-class
#' @import methods
#' @references
#'    [1] Text Interchange Formats (\url{https://github.com/ropensci/tif})
#' @keywords classes
# @author m.eik michalke \email{meik.michalke@@hhu.de}
#' @export kRp_text
#' @exportClass kRp.text
#' @rdname kRp.text-class

kRp_text <- setClass("kRp.text",
    representation=representation(
      lang="character",
      desc="list",
      tokens="data.frame",
      features="logical",
      feat_list="list"
    )
)

setMethod("initialize", "kRp.text",
  function(
    .Object,
    lang=character(),
    desc=list(),
    tokens=init.kRp.text.df(),
    features=logical(),
    feat_list=list()
  ){
    slot(.Object, "lang") <- lang
    slot(.Object, "desc") <- desc
    slot(.Object, "tokens") <- tokens
    slot(.Object, "features") <- features
    slot(.Object, "feat_list") <- feat_list
    validObject(.Object)
    return(.Object)
  }
)


setValidity("kRp.text", function(object){
  features <- slot(object, "features")
  feat_list <- slot(object, "feat_list")
  hyphen <- feat_list[["hyphen"]]
  readability <- feat_list[["readability"]]
  lex_div <- feat_list[["lex_div"]]
  freq <- feat_list[["freq"]]
  corp_freq <- feat_list[["corp_freq"]]
  # obj_summary <- feat_list[["summary"]]

  announced_features <- names(features[features])
  missingFeatures <- sapply(feat_list[announced_features], is.null)
  if(any(missingFeatures)){
    warning(
      paste0(
        "Invalid object: There's no data for activated \"features\":\n  ",
        paste0(announced_features[missingFeatures], collapse=", ")
      ),
      call.=FALSE
    )
  } else {}

  is_valid <- validate_df(
    df=slot(object, "tokens"),
    valid_cols=valid.tokens.kRp.text,
    strict=FALSE,
    warn_only=TRUE,
    name="tokens"
  )

  if(!isTRUE(is_valid)){
    warning("Try fixObject() to fix the issues.", call.=FALSE)
  } else {}

  if(!is.character(slot(object, "lang"))){
    stop(simpleError("Invalid object: Slot \"lang\" must be of class character!"))
  } else {}

  classObj <- list(
    "kRp.hyphen"=list(name="hyphen", obj=hyphen),
    "kRp.readability"=list(name="readability", obj=readability),
    "kRp.TTR"=list(name="lex_div", obj=lex_div),
    "kRp.corp.freq"=list(name="corp_freq", obj=corp_freq)#,
    #"data.frame"=list(name="summary", obj=obj_summary)
  )
  for (thisClassObj in names(classObj)) {
    if(
      all(
        !identical(classObj[[thisClassObj]][["obj"]], list()),
        !is.null(classObj[[thisClassObj]][["obj"]]),
        !all(sapply(classObj[[thisClassObj]][["obj"]], function(x) inherits(x, thisClassObj)))
      )
    ){
      stop(simpleError(paste0("Invalid object: Slot \"", classObj[[thisClassObj]][["name"]], "\" must have entries inheriting from class ", thisClassObj, "!")))
    } else {}
  }

  return(TRUE)
})

## method validate_df()
# checks whether a given data frame provides all columns expected
#  - strict: if TRUE only allows columns defined by valid_cols
setGeneric(
  "validate_df",
  function(
    df,
    valid_cols=valid.tokens.kRp.text,
    strict=TRUE,
    warn_only=TRUE,
    name="tokens"
  ) standardGeneric("validate_df")
)
setMethod("validate_df",
  signature=signature(df="data.frame"),
  function(
    df,
    valid_cols=valid.tokens.kRp.text,
    strict=TRUE,
    warn_only=TRUE,
    name="tokens"
  ){
    df_cols <- colnames(df)
    result <- TRUE

    if(!identical(df_cols, valid_cols)){
      if(isTRUE(strict)){
        wrongCols <- df_cols[!df_cols %in% valid_cols]
        if(length(wrongCols) > 0){
          result <- FALSE
          wrongCols_msg <- paste0(
            "Invalid object: Wrong columns in data frame \"", name, "\":\n  ",
            paste0(wrongCols, collapse=", ")
          )
          ifelse(
            isTRUE(warn_only),
            warning(wrongCols_msg, call.=FALSE),
            stop(simpleError(wrongCols_msg))
          )
        } else {}
      } else {}
      missingCols <- valid_cols[!valid_cols %in% df_cols]
      if(length(missingCols) > 0){
        result <- FALSE
        missingCols_msg <- paste0(
          "Invalid object: Missing columns in data frame \"", name, "\":\n  ",
          paste0(missingCols, collapse=", ")
        )
        ifelse(
          isTRUE(warn_only),
          warning(missingCols_msg, call.=FALSE),
          stop(simpleError(missingCols_msg))
        )
      }
    } else {}
    return(result)
  }
)

unDocUMeantIt/koRpus documentation built on May 21, 2021, 9:26 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

unDocUMeantIt/koRpus
Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

R/01_class_01_kRp.text.R
In unDocUMeantIt/koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

Defines functions init.kRp.text.df

R Package Documentation

Browse R Packages

We want your feedback!

unDocUMeantIt/koRpus Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

R/01_class_01_kRp.text.R In unDocUMeantIt/koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

Defines functions init.kRp.text.df

R Package Documentation

Browse R Packages

We want your feedback!

unDocUMeantIt/koRpus
Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

R/01_class_01_kRp.text.R
In unDocUMeantIt/koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity