R/02_method_get_set_kRp.text.R

Defines functions is.kRp.text is.taggedText

Documented in is.kRp.text is.taggedText

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.


#' Getter/setter methods for koRpus objects
#' 
#' These methods should be used to get or set values of tagged text objects
#' generated by koRpus functions like \code{\link[koRpus:treetag]{treetag}} or \code{\link[koRpus:tokenize]{tokenize}}.
#' 
#' \itemize{
#'   \item{\code{taggedText()} }{returns the \code{tokens} slot.}
#'   \item{\code{doc_id()} }{Returns a character vector of all \code{doc_id} values in the object.}
#'   \item{\code{describe()} }{returns the \code{desc} slot.}
#'   \item{\code{language()} }{returns the \code{lang} slot.}
#'   \item{\code{[}/\code{[[} }{Can be used as a shortcut to index the results of \code{taggedText()}.}
#'   \item{\code{fixObject} }{returns the same object upgraded to the object structure of this package version (e.g., new columns, changed names, etc.).}
#'   \item{\code{hasFeature()} }{returns \code{TRUE} or code{FALSE}, depending on whether the requested feature is present or not.}
#'   \item{\code{feature()} }{returns the list entry of the \code{feat_list} slot for the requested feature.}
#'   \item{\code{corpusReadability()} }{returns the list of \code{kRp.readability} objects, see \code{\link[koRpus:readability]{readability}}.}
#'   \item{\code{corpusHyphen()} }{returns the list of \code{kRp.hyphen} objects, see \code{\link[koRpus:hyphen]{hyphen}}.}
#'   \item{\code{corpusLexDiv()} }{returns the list of \code{kRp.TTR} objects, see \code{\link[koRpus:lex.div]{lex.div}}.}
#'   \item{\code{corpusFreq()} }{returns the frequency analysis data from the \code{feat_list} slot, see \code{\link[koRpus:freq.analysis]{freq.analysis}}.}
#'   \item{\code{corpusCorpFreq()} }{returns the \code{kRp.corp.freq} object of the \code{feat_list} slot, see for example \code{\link[koRpus:read.corp.custom]{read.corp.custom}}.}
#'   \item{\code{corpusStopwords()} }{returns the number of stopwords found in each text (if analyzed) from the \code{feat_list} slot.}
#'   \item{\code{tif_as_tokens_df} }{returns the \code{tokens} slot in a TIF[1] compliant format, i.e., \code{doc_id} is not a factor but a character vector.}
#'   \item{\code{originalText()} }{similar to \code{taggedText()}, but reverts any transformations back to the original text before returning the \code{tokens} slot.
#'      Only works if the object has the feature \code{diff}, see examples.}
#'   \item{\code{diffText()} }{returns the \code{diff} slot, if present.}
#' }
#'
#' @param add.desc Logical, determines whether the \code{desc} column should be re-written with descriptions
#'    for all POS tags.
#' @param doc_id Logical (except for \code{fixObject}, \code{feature}, and \code{[[/[[<-}), if \code{TRUE} the \code{doc_id} column will be a factor with the respective value
#'    of the \code{desc} slot, i.\,e., the document ID will be preserved in the data.frame. If used with \code{fixObject}, can be a character string
#'    to set the document ID manually (the default \code{NA} will preserve existing values and not overwrite them). If used with \code{feature} or \code{[[/[[<-},
#'    a character vector to limit the scope to one or more particular document IDs.
#' @param ... Additional arguments for the generics.
#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @references
#'    [1] Text Interchange Formats (\url{https://github.com/ropensci/tif})
#' @example inst/examples/if_lang_en_clause_start.R
#' @example inst/examples/define_sample_file.R
#' @examples
#'   tokenized.obj <- tokenize(
#'     txt=sample_file,
#'     lang="en"
#'   )
#'
#'   doc_id(tokenized.obj)
#'
#'   describe(tokenized.obj)
#'
#'   language(tokenized.obj)
#'
#'   taggedText(tokenized.obj)
#'   tokenized.obj[["token"]]
#'   tokenized.obj[1:3, "token"]
#'
#'   tif_as_tokens_df(tokenized.obj)
#'
#'   # example for originalText()
#'   tokenized.obj <- jumbleWords(tokenized.obj)
#'   # now compare the jumbled words to the original
#'   tokenized.obj[["token"]]
#'   originalText(tokenized.obj)[["token"]]
#' @example inst/examples/if_lang_en_clause_end.R
setGeneric("taggedText", function(obj, add.desc=FALSE, doc_id=FALSE) standardGeneric("taggedText"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    taggedText,-methods
#'    taggedText,kRp.text-method
#' @include koRpus-internal.R
setMethod("taggedText",
  signature=signature(obj="kRp.text"),
  function (obj, add.desc=FALSE, doc_id=FALSE){
    result <- slot(obj, name="tokens")
    if(isTRUE(add.desc)){
      result[["desc"]] <- explain_tags(
        tags=result[["tag"]],
        lang=language(obj),
        cols="desc"
      )
    } else {}
    if(isTRUE(doc_id)){
      result[["doc_id"]] <- as.factor(describe(obj)[["doc_id"]])
    } else {}
    return(result)
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @param value The new value to replace the current with.
setGeneric("taggedText<-", function(obj, value) standardGeneric("taggedText<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    taggedText<-,-methods
#'    taggedText<-,kRp.text-method
setMethod("taggedText<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    slot(obj, name="tokens") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
setGeneric("doc_id", function(obj, ...) standardGeneric("doc_id"))

#' @rdname kRp.text_get-methods
#' @param has_id A character vector with \code{doc_id}s to look for in the object. The return value
#'    is then a logical vector of the same length, indicating if the respective id was found or not.
#' @export
#' @docType methods
#' @aliases
#'    doc_id,-methods
#'    doc_id,kRp.text-method
setMethod("doc_id",
  signature=signature(obj="kRp.text"),
  function (obj, has_id=NULL){
    result <- unique(as.character(slot(obj, name="tokens")[["doc_id"]]))
    if(is.null(has_id)){
      return(result)
    } else if(length(has_id) > 1) {
      return(has_id %in% result)
    } else {
      return(any(result == has_id))
    }
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @param feature Character string naming the feature to look for. The return value is logical if a single feature
#'    name is given. If \code{feature=NULL}, a character vector is returned, naming all features found in the object.
#' @export
setGeneric("hasFeature", function(obj, feature=NULL, ...) standardGeneric("hasFeature"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    hasFeature,-methods
#'    hasFeature,kRp.text-method
setMethod("hasFeature",
  signature=signature(obj="kRp.text"),
  function (obj, feature=NULL){
    if(is.null(feature)){
      features <- slot(obj, "features")
      return(names(features[features]))
    } else {
      return(isTRUE(slot(obj, name="features")[feature]))
    }
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
setGeneric("hasFeature<-", function(obj, feature, value) standardGeneric("hasFeature<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    hasFeature<-,-methods
#'    hasFeature<-,kRp.text-method
setMethod("hasFeature<-",
  signature=signature(obj="kRp.text"),
  function (obj, feature, value){
    if(!is.logical(value)){
      stop(simpleError("The \"feature\" value must be logical!"))
    } else {}
    if(isTRUE(value)){
      slot(obj, name="features")[feature] <- value
    } else {
      current_features <- slot(obj, name="features")
      slot(obj, name="features") <- current_features[!names(current_features) %in% feature]
    }
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("feature", function(obj, feature, ...) standardGeneric("feature"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    feature,-methods
#'    feature,kRp.text-method
setMethod("feature",
  signature=signature(obj="kRp.text"),
  function (obj, feature, doc_id=NULL){
    if(is.null(doc_id)){
      return(slot(obj, name="feat_list")[[feature]])
    } else {
      doc_ids_in_obj <- doc_id(obj, has_id=doc_id)
      if(all(doc_ids_in_obj)){
        return(slot(obj, name="feat_list")[[feature]][doc_id])
      } else {
        warning(
          paste0("Invalid doc_id, omitted:\n  \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\""),
          call.=FALSE
        )
        return(slot(obj, name="feat_list")[[feature]][doc_id[doc_ids_in_obj]])
      }
    }
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("feature<-", function(obj, feature, value) standardGeneric("feature<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    feature<-,-methods
#'    feature<-,kRp.text-method
setMethod("feature<-",
  signature=signature(obj="kRp.text"),
  function (obj, feature, value){
    slot(obj, name="feat_list")[[feature]] <- value
    if(is.null(value)){
      hasFeature(obj, feature) <- FALSE
    } else {
      hasFeature(obj, feature) <- TRUE
    }
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("corpusReadability", function(obj, ...) standardGeneric("corpusReadability"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    corpusReadability,-methods
#'    corpusReadability,kRp.text-method
setMethod("corpusReadability",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NULL){
    return(feature(obj, "readability", doc_id=doc_id))
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("corpusReadability<-", function(obj, value) standardGeneric("corpusReadability<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    corpusReadability<-,-methods
#'    corpusReadability<-,kRp.text-method
setMethod("corpusReadability<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "readability") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("corpusHyphen", function(obj, ...) standardGeneric("corpusHyphen"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    corpusHyphen,-methods
#'    corpusHyphen,kRp.text-method
setMethod("corpusHyphen",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NULL){
    return(feature(obj, "hyphen", doc_id=doc_id))
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("corpusHyphen<-", function(obj, value) standardGeneric("corpusHyphen<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    corpusHyphen<-,-methods
#'    corpusHyphen<-,kRp.text-method
setMethod("corpusHyphen<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "hyphen") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("corpusLexDiv", function(obj, ...) standardGeneric("corpusLexDiv"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    corpusLexDiv,-methods
#'    corpusLexDiv,kRp.text-method
setMethod("corpusLexDiv",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NULL){
    return(feature(obj, "lex_div", doc_id=doc_id))
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("corpusLexDiv<-", function(obj, value) standardGeneric("corpusLexDiv<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    corpusLexDiv<-,-methods
#'    corpusLexDiv<-,kRp.text-method
setMethod("corpusLexDiv<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "lex_div") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("corpusFreq", function(obj, ...) standardGeneric("corpusFreq"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    corpusFreq,-methods
#'    corpusFreq,kRp.text-method
setMethod("corpusFreq",
  signature=signature(obj="kRp.text"),
  function (obj){
    return(feature(obj, "freq"))
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("corpusFreq<-", function(obj, value) standardGeneric("corpusFreq<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    corpusFreq<-,-methods
#'    corpusFreq<-,kRp.text-method
setMethod("corpusFreq<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "freq") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("corpusCorpFreq", function(obj, ...) standardGeneric("corpusCorpFreq"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    corpusCorpFreq,-methods
#'    corpusCorpFreq,kRp.text-method
setMethod("corpusCorpFreq",
  signature=signature(obj="kRp.text"),
  function (obj){
    return(feature(obj, "corp_freq"))
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("corpusCorpFreq<-", function(obj, value) standardGeneric("corpusCorpFreq<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    corpusCorpFreq<-,-methods
#'    corpusCorpFreq<-,kRp.text-method
setMethod("corpusCorpFreq<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "corp_freq") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("corpusStopwords", function(obj, ...) standardGeneric("corpusStopwords"))

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
#' @aliases
#'    corpusStopwords,-methods
#'    corpusStopwords,kRp.text-method
setMethod("corpusStopwords",
  signature=signature(obj="kRp.text"),
  function (obj){
    return(feature(obj, "stopwords"))
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
# @param value The new value to replace the current with.
setGeneric("corpusStopwords<-", function(obj, value) standardGeneric("corpusStopwords<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    corpusStopwords<-,-methods
#'    corpusStopwords<-,kRp.text-method
setMethod("corpusStopwords<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "stopwords") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @param x An object of class \code{kRp.text} or \code{kRp.hyphen}.
#' @param i Defines the row selector (\code{[}) or the name to match (\code{[[}).
#' @param j Defines the column selector.
#' @param drop Logical, whether the result should be coerced to the lowest possible dimension. See \code{\link[base:Extract]{[}} for more details.
#' @export
#' @docType methods
#' @aliases
#'    [,-methods
#'    [,kRp.text,ANY,ANY-method
setMethod("[",
  signature=signature(x="kRp.text"),
  function (x, i, j, ...){
    return(taggedText(x)[i, j, ...])
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    [<-,-methods
#'    [<-,kRp.text,ANY,ANY,ANY-method
setMethod("[<-",
  signature=signature(x="kRp.text"),
  function (x, i, j, ..., value){
    taggedText(x)[i, j, ...] <- value
    return(x)
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    [[,-methods
#'    [[,kRp.text,ANY-method
setMethod("[[",
  signature=signature(x="kRp.text"),
  function (x, i, doc_id=NULL, ...){
    if(is.null(doc_id)){
      return(taggedText(x)[[i, ...]])
    } else {
      doc_ids_in_obj <- doc_id(x, has_id=doc_id)
      tt <- taggedText(x)
      if(all(doc_ids_in_obj)){
        return(tt[tt[["doc_id"]] %in% doc_id, i, ...])
      } else {
        warning(
          paste0("Invalid doc_id, omitted:\n  \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\""),
          call.=FALSE
        )
        return(tt[tt[["doc_id"]] %in% doc_id[doc_ids_in_obj], i, ...])
      }
    }
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    [[<-,-methods
#'    [[<-,kRp.text,ANY,ANY-method
setMethod("[[<-",
  signature=signature(x="kRp.text"),
  function (x, i, doc_id=NULL, ..., value){
    if(is.null(doc_id)){
      taggedText(x)[[i]] <- value
    } else {
      doc_ids_in_obj <- doc_id(x, has_id=doc_id)
      tt <- taggedText(x)
      if(all(doc_ids_in_obj)){
        tt[tt[["doc_id"]] %in% doc_id, i, ...] <- value
        taggedText(x) <- tt
      } else {
        stop(simpleError(
          paste0("Invalid doc_id:\n  \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\"!")
        ))
      }
    }
    return(x)
  }
)

## the standard generic for describe() is defined in the sylly package
#' @importFrom sylly describe
#' @param simplify Logical, if \code{TRUE} and the result is a list oft length one (i.e., just a single \code{doc_id}),
#'    returns the contents of the single list entry.
#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    describe,-methods
#'    describe,kRp.text-method
setMethod("describe",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NULL, simplify=TRUE, ...){
    result <- slot(obj, name="desc")
    if(!is.null(doc_id)){
      doc_ids_in_obj <- doc_id(obj, has_id=doc_id)
      if(all(doc_ids_in_obj)){
        result <- result[names(result) %in% doc_id]
      } else {
        stop(simpleError(
          paste0("Invalid doc_id:\n  \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\"!")
        ))
      }
    } else {}

    if(all(isTRUE(simplify), length(result) == 1)){
      result <- result[[1]]
    } else {}
    
    return(result)
  }
)

## the standard generic for describe()<- is defined in the sylly package
#' @importFrom sylly describe<-
#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    describe<-,-methods
#'    describe<-,kRp.text-method
setMethod("describe<-",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NULL, ..., value){
    if(is.null(doc_id)){
      slot(obj, name="desc") <- value
    } else {
      doc_ids_in_obj <- doc_id(obj, has_id=doc_id)
      if(all(doc_ids_in_obj)){
        if(length(doc_id) > 1){
          slot(obj, name="desc")[doc_id] <- value
        } else {
          slot(obj, name="desc")[[doc_id]] <- value
        }
      } else {
        stop(simpleError(
          paste0("Invalid doc_id:\n  \"", paste0(doc_id[!doc_ids_in_obj], collapse="\", \""), "\"!")
        ))
      }
    }
    return(obj)
  }
)

## the standard generic for language() is defined in the sylly package
#' @importFrom sylly language
#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    language,-methods
#'    language,kRp.text-method
setMethod("language",
  signature=signature(obj="kRp.text"),
  function (obj){
    result <- slot(obj, name="lang")
    return(result)
  }
)

## the standard generic for language()<- is defined in the sylly package
#' @importFrom sylly language<-
#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    language<-,-methods
#'    language<-,kRp.text-method
setMethod("language<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    slot(obj, name="lang") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
setGeneric("diffText", function(obj, doc_id=NULL) standardGeneric("diffText"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    diffText,-methods
#'    diffText,kRp.text-method
setMethod("diffText",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NULL){
    if(hasFeature(obj, "diff")){
      result <- feature(obj, "diff", doc_id=doc_id)
      return(result)
    } else {
      warning("There is no feature \"diff\" in this object!")
      return(invisible(NULL))
    }
  }
)

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
setGeneric("diffText<-", function(obj, value) standardGeneric("diffText<-"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    diffText<-,-methods
#'    diffText<-,kRp.text-method
setMethod("diffText<-",
  signature=signature(obj="kRp.text"),
  function (obj, value){
    feature(obj, "diff") <- value
    return(obj)
  }
)


#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
setGeneric("originalText", function(obj) standardGeneric("originalText"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    originalText,-methods
#'    originalText,kRp.text-method
setMethod("originalText",
  signature=signature(obj="kRp.text"),
  function (obj){
    return(txt_trans_revert_orig(tokens=taggedText(obj)))
  }
)


#' @param obj An arbitrary \code{R} object.
#' @rdname kRp.text_get-methods
#' @export
is.taggedText <- function(obj){
  inherits(obj, "kRp.text")
}

#' @rdname kRp.text_get-methods
#' @export
is.kRp.text <- function(obj){
  inherits(obj, "kRp.text")
}


#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("fixObject", function(obj, doc_id=NA) standardGeneric("fixObject"))

#' @rdname kRp.text_get-methods
#' @export
#' @docType methods
#' @aliases
#'    fixObject,-methods
#'    fixObject,kRp.text-method
setMethod("fixObject",
  signature=signature(obj="kRp.text"),
  function (obj, doc_id=NA){
    currentDf <- slot(obj, "tokens")
    currentDesc <- slot(obj, "desc")
    currentCols <- colnames(currentDf)
    newDf <- init.kRp.text.df(rows=nrow(currentDf))
    # move all present columns to the new data.frame
    newDf[,currentCols] <- currentDf[,currentCols]
    # adjust column classes where needed

    lang <- slot(obj, "lang")
    tag.class.def <- kRp.POS.tags(lang)
    for (thisCol in c("tag","wclass","desc")){
      if(
        all(
          !is.factor(newDf[[thisCol]]),
          any(
            !thisCol %in% "desc",
            !all(is.na(newDf[[thisCol]]))
          )
        )
      ){
        # make tag a factor with all possible tags for this language as levels
        newDf[[thisCol]] <- factor(
          newDf[[thisCol]],
          levels=unique(tag.class.def[,thisCol])
        )
      } else {}
    }
    newDf <- indexSentenceDoc(newDf, lang=lang, doc_id=doc_id)

    # fix desc slot
    if(any(!"doc_id" %in% names(currentDesc), !is.na(doc_id))){
      currentDesc[["doc_id"]] <- doc_id
    } else {}

    taggedText(obj) <- newDf

    newDesc <- list(currentDesc)
    if(!is.na(currentDesc[["doc_id"]])){
      names(newDesc) <- currentDesc[["doc_id"]]
    }
    describe(obj) <- newDesc

    return(obj)
  }
)

#' @rdname kRp.text_get-methods
#' @docType methods
#' @export
setGeneric("tif_as_tokens_df", function(tokens) standardGeneric("tif_as_tokens_df"))

#' @rdname kRp.text_get-methods
#' @param tokens An object of class \code{\link[koRpus:kRp.text-class]{kRp.text}}.
#' @export
#' @docType methods
#' @aliases
#'    tif_as_tokens_df,-methods
#'    tif_as_tokens_df,kRp.text-method
setMethod("tif_as_tokens_df",
  signature=signature(tokens="kRp.text"),
  function(tokens){
    result <- taggedText(tokens)
    # TIF needs doc_id to be a character vector, not a factor
    result[["doc_id"]] <- as.character(result[["doc_id"]])
    return(result)
  }
)

Try the koRpus package in your browser

Any scripts or data that you put into this service are public.

koRpus documentation built on May 18, 2021, 1:13 a.m.