R/02_method_cTest.R
In koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.


#' Transform text into C-Test-like format
#' 
#' If you feed a tagged text object to this function, its text will be transformed into
#' a format used for C-Tests:
#' \itemize{
#'    \item{the first and last sentence will be left untouched (except if the \code{start}
#'      and \code{stop} values of the \code{intact} parameter are changed}
#'    \item{of all other sentences, the second half of every 2nd word (or as specified by
#'      \code{every}) will be replaced by a line}
#'    \item{words must have at least \code{min.length} characters, otherwise they are
#'      skipped}
#'    \item{words an uneven number of characters will be replaced after the next character,
#'      i.e., a word with five characters will keep the first three and have the last two
#'      replaced}
#' }
#'
#' @export
#' @param ... Additional arguments to the method (as described in this document).
#' @docType methods
#' @return An object of class \code{\link[koRpus:kRp.text-class]{kRp.text}} with the added feature \code{diff}.
#' @rdname cTest-methods
#' @example inst/examples/if_lang_en_clause_start.R
#' @example inst/examples/define_sample_file.R
#' @examples
#'   tokenized.obj <- tokenize(
#'     txt=sample_file,
#'     lang="en"
#'   )
#'   tokenized.obj <- cTest(tokenized.obj)
#'   pasteText(tokenized.obj)
#'
#'   # diff stats are now part of the object
#'   hasFeature(tokenized.obj)
#'   diffText(tokenized.obj)
#' @example inst/examples/if_lang_en_clause_end.R
setGeneric("cTest", function(obj, ...){standardGeneric("cTest")})

#### internal function 
## function cTestify()
# replaces the second half of a word with undercores
cTestify <- function(words, replace.by="_"){
  num.chars <- nchar(words, type="width")
  half.words <- ifelse(num.chars %% 2 == 0, num.chars / 2, (num.chars+1) / 2)
  word.rest <- sapply(seq_along(num.chars), function(idx){
      return(paste(rep(replace.by, num.chars[idx]-half.words[idx]), collapse=""))
    })
  substr(words, start=half.words+1, stop=num.chars) <- word.rest
  return(words)
} ## end function cTestify()

#' @export
#' @docType methods
#' @rdname cTest-methods
#' @aliases cTest,kRp.text-method
#' @param obj An object of class \code{\link[koRpus:kRp.text-class]{kRp.text}}.
#' @param every Integer numeric, setting the frequency of words to be manipulated. By default,
#'    every other word is being transformed.
#' @param min.length Integer numeric, sets the minimum length of words to be considered (in letters).
#' @param intact Named vector with the elements \code{start} and \code{end}. both must be integer values
#'    and define, which sentences are to be left untouched, counted in sentences from beginning and end of the text.
#'    The default is to ignore the first and last sentence.
#' @param replace.by Character, will be used as the replacement for the removed word halves.
#' @include 01_class_01_kRp.text.R
setMethod("cTest",
    signature(obj="kRp.text"),
    function (obj, every=2, min.length=3, intact=c(start=1, end=1), replace.by="_"){
    # check start and end values to leave intact
    start <- ifelse("start" %in% names(intact), intact[["start"]], 1)
    end <- ifelse("end" %in% names(intact), intact[["end"]], 1)
    stopifnot(start >= 0 && end >= 0)

    lang <- language(obj)
    tagged.text <- taggedText(obj)
    # find out where the sentences end
    sntc.tags <- kRp.POS.tags(lang=lang, list.tags=TRUE, tags="sentc")
    txt.sntc.ends <- which(tagged.text[["tag"]] %in% sntc.tags)
    # in case the last sentence is unfinished, i.e., does not have sentence ending punctuation,
    # count it as an extra sentence anyway, to not end up with a mismatch of returned results
    if(nrow(tagged.text) > txt.sntc.ends[length(txt.sntc.ends)]){
      txt.sntc.ends <- c(txt.sntc.ends, nrow(tagged.text))
    } else {}
    # make a list of the sentences, i.e., each element is a data.frame of one sentence
    txt.sntc.list <- lapply(seq_along(txt.sntc.ends), function(idx){
        txt.from <- ifelse(idx == 1, 1, txt.sntc.ends[idx-1]+1)
        txt.to <- txt.sntc.ends[idx]
        return(tagged.text[txt.from:txt.to,])
      })
    num.sntc <- length(txt.sntc.list)

    # now do the actual text alterations
    word.tags <- kRp.POS.tags(lang=lang, list.tags=TRUE, tags="words")
    txt.cTestified <- list()
    for(idx in 1:num.sntc){
        this.sntc <- txt.sntc.list[[idx]]
        # check if this sentence needs manipulation at all
        if(any(idx <= start, idx > (num.sntc - end))){
        } else {
          # we'll only care for actual words with min.length
          txtIsWord <- this.sntc[["tag"]] %in% word.tags
          txtMinChars <- this.sntc[["lttr"]] >= min.length
          txtToChange <- txtIsWord & txtMinChars

          # now only take every other word (like indicated by "every")
          txtToChangeTRUE <- which(txtToChange)
          txtToChange[txtToChangeTRUE[!seq_along(txtToChangeTRUE) %% every == 0]] <- FALSE
          relevant.text <- this.sntc[txtToChange, "token"]
          relevant.text <- cTestify(relevant.text, replace.by=replace.by)
          this.sntc[txtToChange, "token"] <- relevant.text
        }
        txt.cTestified[[idx]] <- this.sntc
      }

    # hm, this is a list now, make a data.frame again
    result.DF <- txt.cTestified[[1]]
    for(this.sntc in txt.cTestified[-1]){
      result.DF <- rbind(result.DF, this.sntc)
    }

    results <- txt_trans_diff(obj=obj, tokens.new=result.DF[["token"]], transfmt="cTest")
    return(results)
  }
)

Any scripts or data that you put into this service are public.

koRpus documentation built on May 18, 2021, 1:13 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

koRpus
Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

R/02_method_cTest.R
In koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

Defines functions cTestify

Try the koRpus package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

koRpus Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

R/02_method_cTest.R In koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

Defines functions cTestify

Try the koRpus package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

koRpus
Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity

R/02_method_cTest.R
In koRpus: Text Analysis with Emphasis on POS Tagging, Readability, and Lexical Diversity