# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus. If not, see <http://www.gnu.org/licenses/>.
# these are internal functions that are called by some of the methods of koRpus.
# they are not exported, hence not meant to be called by users directly,
# and are therefore only documented by the comments in this file.
## function check.file()
# helper function for file checks
check.file <- function(filename, mode="exist", stopOnFail=TRUE){
ret.value <- FALSE
if(any(is.na(filename), is.null(filename))){
return(FALSE)
} else {
if(identical(mode, "exist") | identical(mode, "exec")){
if(as.logical(file_test("-f", filename))){
ret.value <- TRUE
} else {
if(isTRUE(stopOnFail)){
stop(simpleError(paste("Specified file cannot be found:\n", filename)))
} else {}
ret.value <- FALSE
}
} else {}
if(identical(mode, "exec")){
if(as.logical(file_test("-x", filename))){
ret.value <- TRUE
} else {
if(isTRUE(stopOnFail)){
stop(simpleError(paste("Specified file cannot be executed:\n", filename)))
} else {}
ret.value <- FALSE
}
} else {}
if(identical(mode, "dir")){
if(as.logical(file_test("-d", filename))){
ret.value <- TRUE
} else {
if(isTRUE(stopOnFail)){
stop(simpleError(paste("Specified directory cannot be found:\n", filename)))
} else {}
ret.value <- FALSE
}
} else {}
}
return(ret.value)
} ## end function check.file()
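## usage sketch for check.file() -- kept as comments since this is an internal,
## unexported helper; the TreeTagger path below is hypothetical:
# check.file(tempdir(), mode="dir")  # TRUE, tempdir() always exists
# check.file("/opt/treetagger/bin/tree-tagger", mode="exec", stopOnFail=FALSE)
# # TRUE only if the file both exists and is executable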
## function tag.kRp.txt()
# this function takes normal text OR objects of koRpus classes which carry the
# original information of the analyzed text somewhere, and
# tries to return a valid object of class kRp.text instead
# '...' will be passed through to treetag() or tokenize()
tag.kRp.txt <- function(txt, tagger=NULL, lang, objects.only=TRUE, ...){
if(inherits(txt, "kRp.text")){
return(as(txt, "kRp.text"))
} else {}
if(isTRUE(objects.only)){
stop(simpleError(paste("Not a valid class for tag.kRp.txt():", class(txt)[1])))
} else {
if(is.character(txt)){
# set the language definition
if(is.null(lang)){
lang <- get.kRp.env(lang=TRUE)
} else {
## TODO: add some validation here
}
internal.tokenizer <- FALSE
# set a fallback if tagger isn't set but objects.only=FALSE
if(is.null(tagger)){
if(is.null(get.kRp.env(TT.cmd=TRUE, errorIfUnset=FALSE))){
message("No TreeTagger command specified. Using tokenize() as fallback")
internal.tokenizer <- TRUE
} else {
tagger <- "kRp.env"
}
} else {}
if(identical(tagger, "kRp.env")){
if(identical(get.kRp.env(TT.cmd=TRUE), "tokenize")){
internal.tokenizer <- TRUE
} else {}
} else {}
if(identical(tagger, "tokenize") | isTRUE(internal.tokenizer)){
tagged.txt <- tokenize(txt, tag=TRUE, lang=lang, ...)
} else {
tagged.txt <- treetag(txt, treetagger=tagger, lang=lang, ...)
}
return(tagged.txt)
} else {
stop(simpleError("Text object is neither of class kRp.text nor character!"))
}
}
} ## end function tag.kRp.txt()
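## usage sketch for tag.kRp.txt() -- a minimal illustration (not run), assuming
## a language package like koRpus.lang.en is loaded:
# tagged <- tag.kRp.txt(
#   "This is a short example.",
#   tagger="tokenize",  # force the internal tokenizer instead of TreeTagger
#   lang="en",
#   objects.only=FALSE
# )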
## function basic.text.descriptives()
# txt must be a character vector
basic.text.descriptives <- function(txt){
# check for newline characters which might cause trouble
if(any(grepl("\n", txt))){
txt <- unlist(strsplit(txt, "\n"))
} else {}
# number of characters, including spaces and punctuation
# the following vector counts chars per line
vct_all_chars <- nchar(txt, type="width")
num_lines <- length(vct_all_chars)
all_chars_no_newline <- sum(vct_all_chars)
all_chars_incl_newline <- all_chars_no_newline + num_lines
# split txt into individual characters
# note this might have a different length than vct_all_chars + num_lines
# because of grapheme clusters, see ChangeLog for 0.11-5
txt_all_split <- unlist(strsplit(txt, ""))
spaces <- grepl("[[:space:]]", txt_all_split)
num_all_spaces <- sum(spaces)
num_normalized_spaces <- sum(rle(spaces)[["values"]])
# count without any spaces
nospace_chars <- all_chars_no_newline - num_all_spaces
# count punctuation
num_punct <- sum(grepl("[^\\p{L}\\p{M}\\p{N}[:space:]]", txt_all_split, perl=TRUE))
# count digits
num_digits <- sum(grepl("[\\p{N}]", txt_all_split, perl=TRUE))
# no spaces, punctuation or digits
only_letters <- nospace_chars - num_punct - num_digits
# count each cluster of spaces as only one space
onespace_chars <- nchar(
gsub("[[:space:]]{2,}", " ", paste(txt, collapse=" ")),
type="width"
)
results <- list(
all.chars=all_chars_incl_newline,
lines=num_lines,
normalized.space=onespace_chars,
chars.no.space=nospace_chars,
punct=num_punct,
digits=num_digits,
letters=only_letters
)
return(results)
} ## end function basic.text.descriptives()
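## usage sketch (not run): for "Some words.\nA second line." the input is split
## at the newline and the returned list amounts to
## list(all.chars=27, lines=2, normalized.space=26, chars.no.space=22, punct=2, digits=0, letters=20)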
## function basic.tagged.descriptives()
# txt must be an object of class kRp.text
# - if "txt.vector" is given alone, an all new desc slot is prepared
# - if "desc" is given alone, some of its content is being updated by examining "txt"
# using "txt.vector" for re-calculations of lines, all.chars or normalized.space is most definitely a bad idea
# because the vector lost all information on spaces between tokens or newlines. therefore:
# - if *both* "txt.vector" and "desc" are provided and "update.desc=TRUE", as is done by textTransform(),
# the old value of lines is left untouched, and all.chars and normalized.space are adjusted according to
# the difference between the old and new value of chars.no.space
#' @include 02_method_filterByClass.R
basic.tagged.descriptives <- function(txt, lang=NULL, desc=NULL, txt.vector=NULL, update.desc=FALSE, doc_id=NA){
extended_update <- FALSE
if(is.null(lang)){
lang <- language(txt)
} else {}
if(all(!is.null(txt.vector), !is.null(desc), isTRUE(update.desc))){
desc.old <- desc
desc <- basic.text.descriptives(txt.vector)
extended_update <- TRUE
} else if(all(!is.null(txt.vector), is.null(desc))){
# create desc if not present
desc <- basic.text.descriptives(txt.vector)
} else {}
# count sentences
txt.stend.tags <- kRp.POS.tags(lang, list.tags=TRUE, tags="sentc")
txt.stend <- count.sentences(taggedText(txt), txt.stend.tags)
# count words
txt.nopunct <- filterByClass(txt, corp.rm.class="nonpunct", corp.rm.tag=c(), as.vector=FALSE, update.desc=NULL)
num.words <- nrow(taggedText(txt.nopunct))
avg.sentc.length <- num.words / txt.stend
# character distribution
char.distrib <- value.distribs(taggedText(txt)[["lttr"]], omit.missings=FALSE)
lttr.distrib <- value.distribs(taggedText(txt.nopunct)[["lttr"]], omit.missings=FALSE)
# txt.desc$letters had all digits removed
# we'll use these numbers as they are usually more exact than relying on correct tokenization
if("letters.only" %in% names(desc)){
num.letters <- desc[["letters.only"]] + desc[["digits"]]
} else {
num.letters <- desc[["letters"]] + desc[["digits"]]
# for readability calculations
desc[["letters.only"]] <- desc[["letters"]]
desc[["letters"]] <- distrib.to.fixed(lttr.distrib, all.values=num.letters, idx="l")
}
avg.word.length <- num.letters / num.words
if(isTRUE(update.desc)){
desc[["char.distrib"]] <- char.distrib
desc[["lttr.distrib"]] <- lttr.distrib
desc[["words"]] <- num.words
desc[["sentences"]] <- txt.stend
desc[["avg.sentc.length"]] <- avg.sentc.length
desc[["avg.word.length"]] <- avg.word.length
if(isTRUE(extended_update)){
desc[["lines"]] <- desc.old[["lines"]]
chars.diff <- desc[["chars.no.space"]] - desc.old[["chars.no.space"]]
desc[["all.chars"]] <- desc.old[["all.chars"]] + chars.diff
desc[["normalized.space"]] <- desc.old[["normalized.space"]] + chars.diff
} else {}
results <- desc
} else {
results <- append(
desc,
list(
char.distrib=char.distrib,
lttr.distrib=lttr.distrib,
words=num.words,
sentences=txt.stend,
avg.sentc.length=avg.sentc.length,
avg.word.length=avg.word.length
)
)
}
results[["doc_id"]] <- doc_id
return(results)
} ## end function basic.tagged.descriptives()
## function clean.text()
# takes a character vector and a named list. it replaces each occurrence
# of the list names by its value and returns the changed vector
# e.g., use list("\" "="\"") to remove a space behind double quotes
clean.text <- function(txt.vct, from.to=NULL, perl=FALSE){
if(is.null(from.to)){
return(txt.vct)
} else {}
stopifnot(is.character(txt.vct))
stopifnot(is.list(from.to))
for (idx in seq_along(from.to)){
from <- names(from.to)[[idx]]
to <- from.to[[idx]]
txt.vct <- gsub(from, to, txt.vct, perl=perl)
rm("from", "to")
}
return(txt.vct)
} ## end function clean.text()
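## usage sketch (not run): remove a stray space behind a double quote:
# clean.text("she said \" hello", from.to=list("\" "="\""))
# # yields "she said \"hello"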
## function language.setting()
language.setting <- function(tagged.object, force.lang){
stopifnot(inherits(tagged.object, "kRp.text"))
if(is.null(force.lang)){
lang <- tagged.object@lang
} else {
lang <- is.supported.lang(lang.ident=force.lang)
}
return(lang)
} ## end function language.setting()
## function validate_tags()
# takes a character vector of POS tags and a language identifier
# checks all tags for validity
validate_tags <- function(tags, lang){
# get all valid tags
tag.class.def <- kRp.POS.tags(lang)
# only proceed if all tag values are valid
all.found.tags <- unique(tags)
invalid.found.tags <- all.found.tags[!all.found.tags %in% tag.class.def[,"tag"]]
if(length(invalid.found.tags) > 0){
warning(paste0("Invalid tag(s) found: ", paste(invalid.found.tags, collapse = ", "),
"\n This is probably due to a missing tag in kRp.POS.tags() and",
"\n needs to be fixed. It would be nice if you could forward the",
"\n above warning dump as a bug report to the package maintainer!\n"),
call.=FALSE
)
} else {}
# return a logical vector
result <- !tags %in% invalid.found.tags
return(result)
}
## end function validate_tags()
## function explain_tags()
# takes a character vector of POS tags and a language identifier
# returns either an equivalent character vector with explanations of each tag,
# or a matrix if 'cols' is > 1
explain_tags <- function(tags, lang, cols=c("wclass","desc"), valid=rep(TRUE, length(tags))){
# get all valid tags
tag.class.def <- kRp.POS.tags(lang)
tags_explained <- sapply(
seq_along(tags),
function(tagNum){
if(isTRUE(valid[tagNum])){
return(tag.class.def[tag.class.def[,"tag"] == tags[tagNum], cols])
} else {
return(c(wclass="unknown",desc="Unknown (kRp internal)")[cols])
}
},
USE.NAMES=FALSE
)
if(length(cols) > 1){
tags_explained <- as.matrix(t(tags_explained))
} else {
tags_explained <- as.character(tags_explained)
}
return(tags_explained)
}
## end function explain_tags()
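## usage sketch for validate_tags() and explain_tags() -- not run, assumes a
## language package like koRpus.lang.en is loaded so kRp.POS.tags("en") is defined:
# tags <- c("NN", "XYZ")
# ok <- validate_tags(tags, lang="en")     # c(TRUE, FALSE), warns about "XYZ"
# explain_tags(tags, lang="en", valid=ok)  # matrix with columns "wclass" and "desc"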
## function treetag.com()
treetag.com <- function(tagged.text, lang, add.desc=TRUE){
# initialize empty target data.frame to define the order of columns
newDf <- init.kRp.text.df(rows=nrow(tagged.text))
newDf[,c("token","lemma")] <- tagged.text[,c("token","lemma")]
valid_tags <- validate_tags(tags=tagged.text[,"tag"], lang=lang)
# get all valid tags
tag.class.def <- kRp.POS.tags(lang)
# make tag a factor with all possible tags for this language as levels
newDf[["tag"]] <- factor(
tagged.text[,"tag"],
# keep invalid tags for debugging
levels=unique(c(tag.class.def[,"tag"], tagged.text[!valid_tags,"tag"]))
)
# count number of letters, add column "lttr"
newDf[["lttr"]] <- as.integer(nchar(tagged.text[,"token"], type="width"))
# add further columns "wclass" and "desc"
if(isTRUE(add.desc)){
tagsExplained <- explain_tags(tags=tagged.text[,"tag"], lang=lang, cols=c("wclass","desc"), valid=valid_tags)
tagsExplained.wclass <- factor(
tagsExplained[,"wclass"],
levels=unique(tag.class.def[,"wclass"])
)
tagsExplained.desc <- factor(
tagsExplained[,"desc"],
levels=unique(tag.class.def[,"desc"])
)
} else {
tagsExplained.wclass <- factor(
explain_tags(tags=tagged.text[,"tag"], lang=lang, cols="wclass", valid=valid_tags),
levels=unique(tag.class.def[,"wclass"])
)
tagsExplained.desc <- NA
}
newDf[["wclass"]] <- tagsExplained.wclass
newDf[["desc"]] <- tagsExplained.desc
return(newDf)
} ## end function treetag.com()
## function dumpTextToTempfile()
# text: either plain text vector or a connection (where its content needs to be dumped to a file)
# encoding: output file encoding (will default to UTF-8 if NULL)
# pattern: tempfile name pattern
# fileext: tempfile name extension
dumpTextToTempfile <- function(text, encoding=NULL, pattern="tempTextFromObject", fileext=".txt"){
conn.tempfile.path <- tempfile(pattern=pattern, fileext=fileext)
# encoding can always become an issue; try to stick to UTF-8 if no other encoding was specified
if(is.null(encoding)){
conn.tempfile <- file(conn.tempfile.path, open="w", encoding="UTF-8")
} else {
conn.tempfile <- file(conn.tempfile.path, open="w", encoding=encoding)
}
on.exit(close(conn.tempfile))
if(inherits(text, "connection")){
writeLines(readLines(text, encoding=ifelse(is.null(encoding), "", encoding)), con=conn.tempfile)
} else {
writeLines(text, con=conn.tempfile)
}
return(conn.tempfile.path)
} ## end function dumpTextToTempfile()
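## usage sketch (not run): dump a character vector to a UTF-8 encoded tempfile
## and read it back:
# tmp <- dumpTextToTempfile(c("line one", "line two"))
# readLines(tmp)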
## function stopAndStem()
# tagged.text.df is a data.frame from treetag() or tokenize(), to become "tokens"
stopAndStem <- function(tagged.text.df, stopwords=NULL, stemmer=NULL, lowercase=TRUE){
if(!is.null(stopwords)){
if(!is.character(stopwords)){
stop(simpleError("Stopwords must be specified as a character vector!"))
} else {}
# treat all tokens and stopwords in lower case?
if(isTRUE(lowercase)){
this.token <- tolower(tagged.text.df[["token"]])
stopwords <- tolower(stopwords)
} else {
this.token <- tagged.text.df[["token"]]
}
# check if token is a stopword, add column "stop"
tagged.text.df[["stop"]] <- this.token %in% stopwords
} else {
tagged.text.df[["stop"]] <- NA
}
# check for stemming
if(inherits(stemmer, "R_Weka_stemmer_interface") || is.function(stemmer)){
tagged.text.df[["stem"]] <- stemmer(tagged.text.df[,"token"])
} else {
tagged.text.df[["stem"]] <- NA
}
return(tagged.text.df)
} ## end function stopAndStem()
## function indexSentenceDoc()
# after stopAndStem(), add columns "idx", "sntc" and "doc_id" to data.frame
indexSentenceDoc <- function(tagged.text.df, lang, doc_id=NA){
numTokens <- nrow(tagged.text.df)
tagged.text.df[["idx"]] <- 1:numTokens
sentenceEnding <- kRp.POS.tags(lang=lang, tags=c("sentc"), list.classes=TRUE)
endedSentences <- which(tagged.text.df[["wclass"]] %in% sentenceEnding)
if(length(endedSentences) > 0){
# handle texts that don't end with a sentence ending
## TODO: be smarter here -- if the sentence is a quote, the closing quote comes *after* the fullstop
## we'll just ignore this for now!
if(endedSentences[length(endedSentences)] < numTokens){
endedSentences[length(endedSentences)] <- numTokens
} else {}
tagged.text.df[["sntc"]] <- unlist(sapply(
seq_along(endedSentences),
function(numSentence){
if(numSentence > 1){
return(rep(numSentence, endedSentences[numSentence] - endedSentences[numSentence - 1]))
} else {
return(rep(numSentence, endedSentences[numSentence]))
}
},
simplify=FALSE
))
} else {
tagged.text.df[["sntc"]] <- NA
}
# keep an existing doc_id column only if it's present and the new value would be NA
if(!"doc_id" %in% colnames(tagged.text.df) | !is.na(doc_id)){
tagged.text.df[["doc_id"]] <- factor(doc_id)
} else if(length(unique(tagged.text.df[["doc_id"]])) > 1){
# just check for sane doc_id
stop(simpleError("You provided a doc_id column that included multiple IDs. This is not supported."))
} else {}
return(tagged.text.df)
} ## end function indexSentenceDoc()
## function tagged.txt.rm.classes()
# takes a tagged text object and returns it without punctuation or other defined
# classes or tags. can also return tokens in lemmatized form.
#
# boolean: don't return the actual reduced data, but a logical vector indicating
# which values (i.e. rows) would have been removed
# NOTE: "lemma" only takes effect if "as.vector=TRUE"!
tagged.txt.rm.classes <- function(txt, lemma=FALSE, lang, corp.rm.class, corp.rm.tag, as.vector=TRUE, boolean=FALSE){
# to avoid needless NOTEs from R CMD check
wclass <- tag <- rel.col <- NULL
stopifnot(is.data.frame(txt))
valid.tagset <- kRp.POS.tags(lang)
txt.rm.tags <- c()
if(identical(corp.rm.class, "nonpunct")){
corp.rm.class <- kRp.POS.tags(lang, tags=c("punct", "sentc"), list.classes=TRUE)
} else {}
# "stopword" needs to be treated differently, it's another column
if(any(corp.rm.class == "stopword")){
if(all(is.na(txt[, "stop"]))){
warning("Stopword removal not possible: All values are NA! Did you provide a stopword list when tokenizing?", call.=FALSE)
} else {
txt <- txt[!txt[["stop"]],]
}
# that's all we need -- remove the entry from the vector
corp.rm.class <- corp.rm.class[!corp.rm.class %in% "stopword"]
} else {}
if(is.vector(corp.rm.class) && length(corp.rm.class) > 0){
# only proceed if all class values are valid
if(all(corp.rm.class %in% valid.tagset[, "wclass"])){
txt.rm.tag.classes <- valid.tagset[valid.tagset[, "wclass"] %in% corp.rm.class, "tag"]
txt.rm.tags <- unique(c(txt.rm.tags, txt.rm.tag.classes))
} else {
stop(simpleError("Invalid value in corp.rm.class!"))
}
} else {}
if(is.vector(corp.rm.tag) && length(corp.rm.tag) > 0){
# only proceed if all tag values are valid
if(all(corp.rm.tag %in% valid.tagset[, "tag"])){
txt.rm.tags <- unique(c(txt.rm.tags, corp.rm.tag))
} else {
stop(simpleError("Invalid value in corp.rm.tag!"))
}
} else {}
# in this vector, FALSE means "remove"
rm.boolean <- !txt[["tag"]] %in% txt.rm.tags
if(isTRUE(boolean)){
return(rm.boolean)
} else if(isTRUE(as.vector)){
# return only a vector with the tokens themselves, or the whole object?
if(isTRUE(lemma)){
return(txt[rm.boolean, "lemma"])
} else{
return(txt[rm.boolean, "token"])
}
} else {
return(txt[rm.boolean,])
}
} ## end function tagged.txt.rm.classes()
## function count.sentences()
# expects txt to be a tagged text data.frame (e.g., the taggedText() slot of a kRp.text object),
# and tags to be a vector with POS tags indicating sentence endings
count.sentences <- function(txt, tags){
num.sentences <- length(unlist(sapply(tags, function(x){which(txt[,"tag"] == x)}), use.names=FALSE))
return(num.sentences)
} ## end function count.sentences()
## function value.distribs()
value.distribs <- function(value.vect, omit.missings=TRUE){
vector.length <- length(unlist(value.vect))
# some hard to tokenize texts can end up with really strange values here,
# exceeding the threshold of maxsum=100 for summary() on factors.
# as a result, the as.numeric(names()) a few lines below will fail because
# vector.summary has one last entry called "(Other)".
# so we'll dynamically adjust maxsum to the needed value
value.factor <- as.factor(value.vect)
vector.summary <- summary(value.factor, maxsum=length(levels(value.factor)))
# try to fill up missing values with 0, e.g., found no words with 5 characters
if(!isTRUE(omit.missings)){
if(!is.numeric(value.vect)){
# imputing only makes sense for numeric values
stop(simpleError("value.distribs(): Imputing missings is only valid for numeric vectors!"))
} else {}
present.values <- as.numeric(names(vector.summary))
max.value <- max(present.values)
missing.values <- c(1:max.value)[!c(1:max.value) %in% present.values]
append.to.summary <- rep(0, length(missing.values))
names(append.to.summary) <- as.character(missing.values)
vector.summary <- c(vector.summary, append.to.summary)
# finally sort the outcome
vector.summary <- vector.summary[order(as.numeric(names(vector.summary)))]
} else {}
# add cumulative values
vect.summ.cum <- cumsum(vector.summary)
# inverse
vect.summ.inv <- rbind(vector.summary, vect.summ.cum, vector.length - vect.summ.cum)
# add percentages
vect.summ.cum.pct <- rbind(vect.summ.inv, (vect.summ.inv * 100 / vector.length))
dimnames(vect.summ.cum.pct)[[1]] <- c("num", "cum.sum", "cum.inv", "pct", "cum.pct", "pct.inv")
return(vect.summ.cum.pct)
} ## end function value.distribs()
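## usage sketch (not run): with omit.missings=FALSE, gaps in a numeric vector are
## imputed with zero counts:
# value.distribs(c(1, 2, 2, 3, 5), omit.missings=FALSE)
# # a 6 x 5 matrix with rows num, cum.sum, cum.inv, pct, cum.pct and pct.inv;
# # the column for the unseen value 4 holds num=0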
## function distrib.from.fixed()
# make distribution matrices from fixed text features
distrib.from.fixed <- function(fixed.vect, all.words, idx="s", unaccounted="x"){
num.accounted <- fixed.vect[grep(paste0(idx, "[[:digit:]]+"), names(fixed.vect))]
num.unaccounted <- all.words - sum(num.accounted)
num.names <- names(num.accounted)
distrib <- value.distribs(c(unlist(sapply(num.names, function(this.val){
this.val.num <- fixed.vect[[this.val]]
this.val.name <- as.numeric(gsub("[^[:digit:]]", "", this.val))
return(rep(this.val.name, this.val.num))
})), if(num.unaccounted > 0){rep(unaccounted, num.unaccounted)}))
return(distrib)
} ## end function distrib.from.fixed()
## function distrib.to.fixed()
# the other way round, to be able to use the desc slot with readability.num()
distrib.to.fixed <- function(distrib, all.values, idx="s"){
values <- distrib["num",]
names(values) <- paste0(idx, colnames(distrib))
values <- c(all=all.values, values)
return(values)
} ## end function distrib.to.fixed()
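## usage sketch (not run): a round trip from raw letter counts to the fixed format:
# lttr.distrib <- value.distribs(c(3, 5, 5, 8), omit.missings=FALSE)
# distrib.to.fixed(lttr.distrib, all.values=21, idx="l")
# # named vector c(all=21, l1=0, l2=0, l3=1, l4=0, l5=2, l6=0, l7=0, l8=1)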
## function text.analysis()
# "txt" must be a tagged and commented text object!
text.analysis <- function(txt, lang, corp.rm.class, corp.rm.tag, desc){
## global stuff
# count sentences
txt.stend.tags <- kRp.POS.tags(lang, list.tags=TRUE, tags="sentc")
txt.vector <- as.vector(txt[,"token"])
txt.nopunct <- tagged.txt.rm.classes(txt, lemma=FALSE, lang=lang, corp.rm.class=corp.rm.class, corp.rm.tag=corp.rm.tag)
txt.lemma <- tagged.txt.rm.classes(txt, lemma=TRUE, lang=lang, corp.rm.class=corp.rm.class, corp.rm.tag=corp.rm.tag)
txt.lemma.types <- unique(txt.lemma)
## end global stuff
## statistics on word classes
word.classes <- kRp.POS.tags(lang, list.classes=TRUE)
# true/false-matrix for word classes
class.matrix <- sapply(word.classes, function(x){txt[,"wclass"] == x})
class.num <- colSums(class.matrix)
# types per word class (unique tokens)
class.matrix.types <- sapply(word.classes, function(x){length(unique(txt[,"token"][class.matrix[,x]]))})
# types per word class (unique lemmata)
class.matrix.lemma <- sapply(word.classes, function(x){length(unique(txt[,"lemma"][class.matrix[,x]]))})
# counting classes
class.analysis <- sapply(word.classes, function(wd.class){
txt.num.class <- txt[,"wclass"] == wd.class
assign(wd.class, txt.vector[txt.num.class])
}
)
# length of sentences
# for this, first we need the text with only words and sentence ending punctuation
txt.sentc.ends <- txt[txt[,"tag"] %in% kRp.POS.tags(lang=lang, list.tags=TRUE, tags=c("words", "sentc")), ]
sentence.index <- c(0, which(txt.sentc.ends[,"tag"] %in% txt.stend.tags))
## note: if the last sentence didn't end with some punctuation, it's not counted as a sentence at all!
if(identical(sentence.index, 0)){
# seems there are no sentences at all
# we'll take the number of all words as an estimate
sentence.lengths <- nrow(txt[txt[,"tag"] %in% kRp.POS.tags(lang=lang, list.tags=TRUE, tags=c("words")), ]) # nrow(), not length(): we want the number of rows (words), not columns
} else {
sentence.lengths <- sapply(2:length(sentence.index), function(sntc.ends.here){
sntc.words <- sentence.index[sntc.ends.here] - sentence.index[sntc.ends.here - 1] - 1
return(sntc.words)
})
}
## further descriptives
num.questions <- sum(txt.vector %in% "?")
num.exclamats <- sum(txt.vector %in% "!")
num.semicolon <- sum(txt.vector %in% ";")
num.colon <- sum(txt.vector %in% ":")
add.results <- list(
all.words=txt.nopunct,
all.lemmata=txt.lemma.types,
classes=class.analysis,
lemmata=length(txt.lemma.types),
freq.token=class.num,
freq.types=class.matrix.types,
freq.lemma=class.matrix.lemma,
sentc.length=sentence.lengths,
sentc.distrib=value.distribs(sentence.lengths),
questions=num.questions,
exclam=num.exclamats,
semicolon=num.semicolon,
colon=num.colon
)
results <- append(add.results, desc)
return(results)
} ## end function text.analysis()
## function word.freq()
# takes single character strings or vectors and looks up their frequency of appearance in corpus data
# "rel" can be used to define the relation of interest: "pct", "pmio", "log10",
# "rank.avg", "rank.min", "rank.rel.avg", "rank.rel.min", "inDocs" or "idf"
word.freq <- function(txt, corp.freq, rel, zero.NAs=FALSE){
# some basic sanity checks
stopifnot(inherits(corp.freq, "kRp.corp.freq"))
if(!is.character(txt)){
stop(simpleError("Word frequency analysis can only be run on character data!"))
} else {}
if(!all(rel %in% c("pct", "pmio", "log10", "rank.avg", "rank.min", "rank.rel.avg", "rank.rel.min", "inDocs", "idf"))){
stop(simpleError(paste0("Option \"rel\" must be \"pct\", \"pmio\", \"log10\", \"rank.avg\", \"rank.min\", \"rank.rel.avg\",\n",
"\"rank.rel.min\", \"inDocs\" or \"idf\"!")))
} else {}
corp.index <- match(txt, corp.freq@words$word)
results <- corp.freq@words[corp.index, rel]
if(isTRUE(zero.NAs) & any(is.na(results))){
if(inherits(results, "data.frame")){
rowsWithNAs <- unique(which(is.na(results), arr.ind=TRUE)[,"row"])
results[rowsWithNAs,] <- rep(0, ncol(results))
} else {
results[is.na(results)] <- 0
}
} else {}
return(results)
} ## end function word.freq()
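## usage sketch (not run) -- 'my.corp.freq' is a placeholder for a kRp.corp.freq
## object, e.g., the result of read.corp.custom():
# word.freq(c("the", "aardvark"), corp.freq=my.corp.freq, rel=c("pmio", "rank.min"), zero.NAs=TRUE)
# # tokens missing from the corpus come back as 0 instead of NA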
## function type.freq()
# this function will identify unique types in a tagged text object
# and count how often each appears in the text
# - txt: must be a tagged text object
# - vector.only: if the numbers and letters are not needed, return the vector of types and quit
type.freq <- function(txt, case.sens=TRUE, verbose=FALSE, lemma=FALSE, fail.if.no.lemmas=TRUE, vector.only=FALSE){
# shall we count all tokens or their lemmas?
if(isTRUE(lemma)){
# do a sanity check, are lemmata present?
if(identical(unique(txt[,"lemma"]), "")){
if(isTRUE(fail.if.no.lemmas)){
stop(simpleError(paste0("You asked to use lemmata, but your text object doesn't include any!",
"\n This is probably the case because you used tokenize() instead of treetag().")))
} else {
return(NULL)
}
} else {
relevant.tokens <- "lemma"
}
} else {
relevant.tokens <- "token"
}
all.tokens <- txt[,c(relevant.tokens,"lttr")]
if(!isTRUE(case.sens)){
all.tokens[[relevant.tokens]] <- tolower(all.tokens[[relevant.tokens]])
} else {}
corp.freq <- unique(all.tokens)
colnames(corp.freq) <- c("type","lttr")
if(isTRUE(vector.only)){
return(corp.freq[["type"]])
} else {
corp.freq[["freq"]] <- 0
num.tokens <- nrow(all.tokens)
num.types <- nrow(corp.freq)
if(isTRUE(verbose)){
type.counter <- 1
} else {}
for (tp in seq_along(corp.freq[["type"]])){
corp.freq[tp, "freq"] <- sum(all.tokens[[relevant.tokens]] %in% corp.freq[tp, "type"], na.rm=TRUE)
if(isTRUE(verbose)){
message(paste0("\t", floor(100*type.counter/num.types), "% complete, processing ", relevant.tokens, " ", type.counter, " of ", num.types, ": \"", corp.freq[tp, "type"], "\""))
message(paste0(" (found ", corp.freq[tp, "freq"], " times in ", num.tokens, " ", relevant.tokens, "s)\n"))
type.counter <- type.counter + 1
} else {}
}
# order results
corp.freq <- corp.freq[order(corp.freq[,"freq"], corp.freq[,"lttr"], decreasing=TRUE),]
rownames(corp.freq) <- NULL
}
return(corp.freq)
} ## end function type.freq()
## function create.corp.freq.object()
# this function should be used to create corpus frequency objects
# the idea is to have this one function so that any kind of corpora data
# can be squeezed into the format we want.
# "matrix.freq" needs to be a matrix with three columns:
# "num" (some ID), "word" (the actual running word form) and "freq" (absolute frequency).
# optional columns are "lemma", "tag", "wclass", "inDocs" and "idf".
# "df.meta" must be a data.frame with two columns: "meta" (name of meta information) and its "value".
# "dscrpt.meta" must be a data.frame with six columns: "tokens" (old: "words"), "types" (old: "dist.words"),
# "words.p.sntc", "chars.p.sntc", "chars.p.wform" and "chars.p.word"; if NULL its value is set to an empty default
# "extra.cols" is an optional data.frame with additional columns, e.g. valence data
# "caseSens" determines whether frequency stats should distinguish between letter cases or consolidate otherwise identical tokens
create.corp.freq.object <- function(matrix.freq, num.running.words, df.meta, df.dscrpt.meta,
matrix.table.bigrams=NULL, matrix.table.cooccur=NULL, extra.cols=NULL, caseSens=TRUE, quiet=FALSE){
tokenFreq <- as.numeric(matrix.freq[,"freq"])
if(!isTRUE(caseSens)){
# first look for tokens that only differ in letter case and
# replace freq with the sum of all occurrences
lowerTokens <- tolower(matrix.freq[,"word"])
# caseSensTokens <- unique(lowerTokens[duplicated(lowerTokens)])
# caseSensFreq <- as.list(rep(0, length(caseSensTokens)))
# names(caseSensFreq) <- caseSensTokens
# caseSensFreq <- as.environment(caseSensFreq)
caseSensFreq <- new.env()
if(!isTRUE(quiet)){
num.tokens <- length(lowerTokens) # length(caseSensTokens)
message(paste0("Re-calculating token frequencies (case insensitive, ", num.tokens, " tokens)"))
# just some feedback, so we know the machine didn't just freeze...
prgBar <- txtProgressBar(min=0, max=num.tokens, style=3)
} else {}
start.with <- new.env()
start.with[["count"]] <- 1
# we don't really need this object, sapply updates the environment caseSensFreq
tmp <- sapply(
seq_along(tokenFreq),
function(numToken){
if(!isTRUE(quiet)){
# update progress bar
setTxtProgressBar(prgBar, start.with[["count"]])
} else {}
thisLowToken <- lowerTokens[numToken]
if(is.null(caseSensFreq[[thisLowToken]])){
caseSensFreq[[thisLowToken]] <- tokenFreq[numToken]
} else {
caseSensFreq[[thisLowToken]] <- caseSensFreq[[thisLowToken]] + tokenFreq[numToken]
}
start.with[["count"]] <- start.with[["count"]] + 1
}
)
tokenFreq <- unlist(sapply(
lowerTokens,
function(thisToken){
caseSensFreq[[thisToken]]
},
USE.NAMES=FALSE
))
if(!isTRUE(quiet)){
setTxtProgressBar(prgBar, num.tokens)
close(prgBar)
} else {}
} else {}
# try to work around missing meta information
if(is.na(num.running.words)){
num.running.words <- sum(tokenFreq)
}
# calculate rank data
rank.avg <- rank(tokenFreq, ties.method="average")
rank.min <- rank(tokenFreq, ties.method="min")
# for better comparability, compute relative ranks
# can take values between 0 and 100
rank.rel.avg <- (rank.avg / max(rank.avg)) * 100
rank.rel.min <- (rank.min / max(rank.min)) * 100
words.per.mio <- tokenFreq %/% (num.running.words/1000000)
log10.per.mio <- log10(words.per.mio)
# correct for lowest frequency words and log10(0), which returns -Inf
log10.per.mio[log10.per.mio < 0] <- 0
# check if the optional columns "lemma", "tag", "wclass", "inDocs" and "idf" are present
if("lemma" %in% colnames(matrix.freq)){
have.lemma <- matrix.freq[,"lemma"]
} else {
have.lemma <- NA
}
if("tag" %in% colnames(matrix.freq)){
have.tag <- matrix.freq[,"tag"]
} else {
have.tag <- NA
}
if("wclass" %in% colnames(matrix.freq)){
have.wclass <- matrix.freq[,"wclass"]
} else {
have.wclass <- NA
}
if("inDocs" %in% colnames(matrix.freq)){
have.inDocs <- matrix.freq[,"inDocs"]
} else {
have.inDocs <- NA
}
if("idf" %in% colnames(matrix.freq)){
have.idf <- matrix.freq[,"idf"]
} else {
have.idf <- NA
}
df.words <- data.frame(
num=as.numeric(matrix.freq[,"num"]),
word=matrix.freq[,"word"],
lemma=have.lemma,
tag=have.tag,
wclass=have.wclass,
lttr=nchar(matrix.freq[,"word"], type="width", allowNA=TRUE),
freq=tokenFreq,
pct=tokenFreq/num.running.words,
pmio=words.per.mio,
log10=log10.per.mio,
rank.avg=rank.avg,
rank.min=rank.min,
rank.rel.avg=rank.rel.avg,
rank.rel.min=rank.rel.min,
inDocs=have.inDocs,
idf=have.idf,
stringsAsFactors=FALSE)
# add extra columns, if given
if(!is.null(extra.cols)){
df.words <- cbind(df.words, extra.cols, stringsAsFactors=FALSE)
} else {}
# set missing information to valid defaults
if(is.null(df.dscrpt.meta)){
df.dscrpt.meta <- slot(kRp_corp_freq(), "desc")
} else {}
if(is.null(df.meta)){
df.meta <- slot(kRp_corp_freq(), "meta")
} else {}
if(is.null(matrix.table.bigrams)){
df.table.bigrams <- slot(kRp_corp_freq(), "bigrams")
} else {
message("Fetching bigram tokens from data... ", appendLF=FALSE)
df.table.bigrams <- data.frame(
token1=df.words[
sapply(
as.numeric(matrix.table.bigrams[,"token1"]),
function(x){
which(df.words[,"num"] == x)
}
), "word"],
token2=df.words[
sapply(
as.numeric(matrix.table.bigrams[,"token2"]),
function(x){
which(df.words[,"num"] == x)
}
), "word"],
freq=as.numeric(matrix.table.bigrams[,"freq"]),
sig=as.numeric(matrix.table.bigrams[,"sig"])
)
# sort by frequency
message("sorting... ", appendLF=FALSE)
df.table.bigrams <- df.table.bigrams[with(df.table.bigrams, order(freq, decreasing=TRUE)),]
message("done.")
}
if(is.null(matrix.table.cooccur)){
df.table.cooccur <- slot(kRp_corp_freq(), "cooccur")
} else {
message("Fetching co-occurrence tokens from data... ", appendLF=FALSE)
df.table.cooccur <- data.frame(
token1=df.words[
sapply(
as.numeric(matrix.table.cooccur[,"token1"]),
function(x){
which(df.words[,"num"] == x)
}
), "word"],
token2=df.words[
sapply(
as.numeric(matrix.table.cooccur[,"token2"]),
function(x){
which(df.words[,"num"] == x)
}
), "word"],
freq=as.numeric(matrix.table.cooccur[,"freq"]),
sig=as.numeric(matrix.table.cooccur[,"sig"])
)
# sort by frequency
message("sorting... ", appendLF=FALSE)
df.table.cooccur <- df.table.cooccur[with(df.table.cooccur, order(freq, decreasing=TRUE)),]
message("done.")
}
results <- kRp_corp_freq(
meta=df.meta,
words=df.words,
desc=df.dscrpt.meta,
bigrams=df.table.bigrams,
cooccur=df.table.cooccur,
caseSens=caseSens
)
return(results)
} ## end function create.corp.freq.object()
## function noInf.summary()
# helper function to produce characteristics summaries without infinite values
# used in the show methods
# data must be a vector
noInf.summary <- function(data, add.sd=FALSE){
data[is.infinite(data)] <- NA
print(summary(data))
if(isTRUE(add.sd)){
cat(" SD\n ", round(sd(data, na.rm=TRUE), digits=4), "\n", sep="")
} else {}
} ## end function noInf.summary()
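## usage sketch (not run): infinite values are masked as NA before summarizing:
# noInf.summary(c(1, 2, Inf, 4), add.sd=TRUE)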
## function is.supported.lang()
# determines if a language is supported, and returns the correct identifier
is.supported.lang <- function(lang.ident, support="treetag"){
if(identical(support, "treetag")){
treetag.supported <- as.list(as.environment(.koRpus.env))[["langSup"]][["kRp.POS.tags"]][["tags"]]
if(lang.ident %in% names(treetag.supported)){
res.ident <- lang.ident
} else {
stop(simpleError(
paste0(
"Unknown tag definition requested: ", lang.ident, "\n",
"See ?available.koRpus.lang() for a list of supported languages."
)
))
}
} else {}
return(res.ident)
} ## end function is.supported.lang()
## function txt.compress()
# will gzip an object and return file size and compression ratio
# can be used to estimate redundancy on a global level
txt.compress <- function(obj, level=9, ratio=FALSE, in.mem=TRUE){
if(is.character(obj)){
txt <- obj
} else if(inherits(obj, "kRp.text")){
txt <- pasteText(obj)
} else {
stop(simpleError("Cannot compress objects which are neither character nor of a koRpus class!"))
}
if(isTRUE(in.mem)){
zz.gz <- memCompress(txt, "gzip")
zz.gz.size <- object.size(zz.gz)
if(isTRUE(ratio)){
zz.non <- memCompress(txt, "none")
zz.non.size <- object.size(zz.non)
ratio <- as.numeric(zz.gz.size / zz.non.size)
} else {
zz.non.size <- NA
ratio <- NA
}
} else {
tmp.path <- tempfile("koRpus.gz")
if(!dir.create(tmp.path, recursive=TRUE)) stop(simpleError("Compression skipped: Can't create temporary directory!"))
# if the function is done, remove the tempdir
on.exit(unlink(tmp.path, recursive=TRUE))
# (probably) create two connections, one compressed
zz.gz <- gzfile(file.path(tmp.path, "bloat.gz"), open="w", compression=level)
writeLines(txt, zz.gz, sep=" ")
close(zz.gz)
zz.gz.size <- file.info(file.path(tmp.path, "bloat.gz"))$size
if(isTRUE(ratio)){
zz.non <- file(file.path(tmp.path, "bloat"), open="w")
writeLines(txt, zz.non, sep=" ")
close(zz.non)
zz.non.size <- file.info(file.path(tmp.path, "bloat"))$size
ratio <- zz.gz.size / zz.non.size
} else {
zz.non.size <- NA
ratio <- NA
}
}
results <- list(file.size=zz.non.size, gz.size=zz.gz.size, ratio=ratio)
return(results)
} ## end function txt.compress()
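## usage sketch (not run): highly redundant text compresses well, i.e., yields a small ratio:
# txt.compress(paste(rep("lorem ipsum", 100), collapse=" "), ratio=TRUE)[["ratio"]]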
## function read.udhr()
# this function will read the Universal Declaration of Human Rights from text files by
# the UDHR in Unicode project: https://unicode.org/udhr/downloads.html
# txt.path must be a character string pointing to the directory where the bulk files
# were extracted, or to the ZIP file.
read.udhr <- function(txt.path, quiet=TRUE){
# check if txt.path is a zip file
if(!as.logical(file_test("-d", txt.path))){
if(file.exists(txt.path) & grepl("(\\.zip|\\.ZIP)$", txt.path)){
# ok, seems to be an existing zip file
# unpack it to a temporary location and alter variable to use that location
tmp.path <- tempfile("koRpus.UDHR")
if(!dir.create(tmp.path, recursive=TRUE)) stop(simpleError("UDHR: Can't create temporary directory!"))
# if the function is done, remove the tempdir
on.exit(unlink(tmp.path, recursive=TRUE))
unzip(txt.path, exdir=tmp.path)
udhr.path <- tmp.path
} else {
stop(simpleError(paste("Cannot access UDHR location:", txt.path)))
}
} else {
udhr.path <- txt.path
}
# # requires package XML
# udhr.xml <- xmlParse(file.path(udhr.path, "index.xml"))
# udhr.list <- xmlSApply(xmlRoot(udhr.xml), xmlAttrs)
## since there was no windows package XML for R 2.13, this is a primitive parser which does the job:
udhr.XML <- readLines(file.path(udhr.path, "index.xml"), warn=FALSE)
# remove comments
udhr.XML <- gsub("[[:space:]]*<!--(.*)-->", "", udhr.XML[grep("<udhr ", udhr.XML)])
# filter out only the interesting parts
udhr.XML <- gsub("([[:space:]]+<udhr )(.*)/>", "\\2", udhr.XML, perl=TRUE)
# split the vector into a list of elements; then we have basically what xmlParse() and xmlSApply() returned
udhr.XML <- gsub("([[:alnum:]]+)=('[^']+'|'')[[:space:]]+", "\\1=\\2#", udhr.XML, perl=TRUE)
# as a safety measure, put iso639-3 in quotes
udhr.XML <- gsub("#iso639-3=", "#\"iso639-3\"=", udhr.XML)
udhr.XML.list <- strsplit(udhr.XML, split="#")
udhr.list <- lapply(seq_along(udhr.XML.list), function(cur.entry){eval(parse(text=paste0("c(", paste(udhr.XML.list[[cur.entry]], collapse=", "), ")")))})
names(udhr.list) <- seq_along(udhr.list)
# correct for missing values and variables
udhr.list.corr <- sapply(udhr.list, function(udhr.entry){
# older index files split up l+v and n+nv, newer have combined f and n elements
if("f" %in% names(udhr.entry)){
udhr.entry[["file"]] <- file.path(udhr.path, paste0("udhr_", udhr.entry[["f"]], ".txt"))
udhr.entry[["name"]] <- udhr.entry[["n"]]
} else if("l" %in% names(udhr.entry)){
if(!"v" %in% names(udhr.entry)){
udhr.entry[["v"]] <- NA
udhr.entry[["file"]] <- file.path(udhr.path, paste0("udhr_", udhr.entry[["l"]], ".txt"))
} else {
udhr.entry[["file"]] <- file.path(udhr.path, paste0("udhr_", udhr.entry[["l"]], "_", udhr.entry[["v"]], ".txt"))
}
if(!"nv" %in% names(udhr.entry)){
udhr.entry[["nv"]] <- NA
udhr.entry[["name"]] <- udhr.entry[["n"]]
} else {
udhr.entry[["name"]] <- paste0(udhr.entry[["n"]], " (",udhr.entry[["nv"]],")")
}
} else {
stop(simpleError("Erm, sorry -- looks like the format of the index.xml file has changed, please file a bug report!"))
}
# now try to load the translated text from file and add it to the entry
udhr.file <- udhr.entry[["file"]]
if(file.exists(udhr.file)){
# remove the trailing notice also, because it will keep gzip from
# optimizing for the actual charset
udhr.entry[["text"]] <- gsub("^.*udhr. --- ", "", paste(suppressWarnings(scan(udhr.file, what=character(), quiet=quiet)), collapse=" "))
} else {
udhr.entry[["text"]] <- NA
}
# reorder again, otherwise t() will result in strange outcomes...
entry.names <- sort(names(udhr.entry))
return(udhr.entry[entry.names])
}
)
udhr.df.corr <- as.data.frame(t(udhr.list.corr), stringsAsFactors=FALSE)
# remove entries without a text to compare
results <- udhr.df.corr[!is.na(udhr.df.corr[,"text"]), ]
return(results)
} ## end function read.udhr()
## function text.1st.letter()
# changes the first letter of a word to upper or lower case
# if case="change", the present case will be switched to the other
text.1st.letter <- function(word, case){
results <- sapply(word, function(cur.word){
word.vector <- unlist(strsplit(cur.word, split=""))
if(identical(case, "upper")){
word.vector[1] <- toupper(word.vector[1])
} else{}
if(identical(case, "lower")){
word.vector[1] <- tolower(word.vector[1])
} else{}
if(identical(case, "change")){
if(isTRUE(grepl("([[:lower:]])", word.vector[1]))){
word.vector[1] <- toupper(word.vector[1])
} else if(isTRUE(grepl("([[:upper:]])", word.vector[1]))){
word.vector[1] <- tolower(word.vector[1])
} else {}
} else{}
word.new <- paste(word.vector, collapse="")
return(word.new)
}, USE.NAMES=FALSE)
return(results)
} ## end function text.1st.letter()
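## usage sketch (not run):
# text.1st.letter(c("word", "Word"), case="change")
# # returns c("Word", "word")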
## function taggz()
# takes a vector of tokens and adds internal tags
# unicode escapes:
# <e2><80><99> -> \u2019 right single quotation mark
# <e2><80><93> -> \u2013 en dash
#' @importFrom data.table data.table :=
taggz <- function(tokens, abbrev=NULL, heur.fix=list(pre=c("\u2019","'"), suf=c("\u2019","'")), ign.comp="", sntc=c(".","!","?",";",":")){
# make R CMD check happy...
token <- tag <- NULL
tagged.text <- data.table(token=tokens, tag="unk.kRp")
Encoding(tagged.text$token) <- "UTF-8"
Encoding(tagged.text$tag) <- "UTF-8"
tagged.text[token == "" & tag == "unk.kRp", tag := "unk"]
# all letters, assume it's a word
tagged.text[!grepl(paste("[^\\p{L}\\p{M}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "word.kRp"]
# all digits, assume it's a number
tagged.text[!grepl(paste("[^\\p{N}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "no.kRp"]
# assume it's an abbreviation
tagged.text[token %in% abbrev & tag == "unk.kRp", tag := "abbr.kRp"]
# assume it's a sentence ending
tagged.text[token %in% sntc & tag == "unk.kRp", tag := ".kRp"]
# all dots
tagged.text[!grepl("[^.]", token, perl=TRUE) & tag == "unk.kRp", tag := "-kRp"]
tagged.text[token == "," & tag == "unk.kRp", tag := ",kRp"]
tagged.text[!grepl(paste("[^\\p{Ps}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "(kRp"]
# } else if(tkn %in% c("(","{","[")){
# return(c(token=tkn, tag="(kRp"))
tagged.text[!grepl(paste("[^\\p{Pe}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := ")kRp"]
# } else if(tkn %in% c(")","}","]")){
# return(c(token=tkn, tag=")kRp"))
# } else if(tkn %in% c("-","\u2013")){
# return(c(token=tkn, tag="-kRp"))
tagged.text[token %in% c("\"","'","''","`","``","\u2019","\u2019\u2019") & tag == "unk.kRp", tag := "''kRp"]
tagged.text[!grepl(paste("[^\"\\p{Pi}\\p{Pf}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "''kRp"]
tagged.text[!grepl(paste("[^\\p{Pd}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "-kRp"]
# any other punctuation
tagged.text[!grepl(paste("[^\\p{P}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "-kRp"]
# simple heuristics for abbreviations
tagged.text[!grepl(paste("[^\\p{L}\\p{M}.",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "abbr.kRp"]
# simple heuristics for pre- and suffixes
tagged.text[!grepl(paste("[^\\p{L}\\p{M}\\p{N}",paste(unique(unlist(heur.fix), ign.comp), collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "word.kRp"]
# automatic headline or paragraph detection:
tagged.text[token == "<kRp.h>" & tag == "unk.kRp", tag := "hon.kRp"]
tagged.text[token == "</kRp.h>" & tag == "unk.kRp", tag := "hoff.kRp"]
tagged.text[token == "<kRp.p/>" & tag == "unk.kRp", tag := "p.kRp"]
tagged.text <- as.matrix(tagged.text)
return(tagged.text)
} ## end function taggz()
## function tokenz()
# a simple tokenizer. txt must be a character vector with stuff to tokenize
# will return a vector with one token per element
# 'abbrev' must be a text file encoded in 'encoding', with one abbreviation per line
tokenz <- function(txt, split="[[:space:]]", ign.comp="-", heuristics="abbr", abbrev=NULL,
encoding="unknown", heur.fix=list(pre=c("\u2019","'"), suf=c("\u2019","'")), tag=FALSE,
sntc=c(".","!","?",";",":"), detect=c(parag=FALSE, hline=FALSE)){
## clean ign.comp for use in regexp
ign.comp.rex <- ign.comp
ign.comp.rex[ign.comp.rex %in% "-"] <- "\\-"
ign.comp.rex <- paste(ign.comp.rex, collapse="")
## prep the text
# if headlines and paragraphs should be autodetected
if(any(detect)){
hline <- ifelse("hline" %in% names(detect), detect[["hline"]], FALSE)
parag <- ifelse("parag" %in% names(detect), detect[["parag"]], FALSE)
lines.empty <- grepl("^([[:space:]]*)$", txt)
if(isTRUE(hline)){
# lines without any sentence ending punctuation are considered headlines
headlines <- !lines.empty & !grepl("[.;:!?]", txt) & !grepl(",$", txt)
txt[headlines] <- paste("<kRp.h>", txt[headlines], "</kRp.h>", sep=" ")
} else {}
if(isTRUE(parag)){
paragraphs <- lines.empty & c(FALSE, !lines.empty[-length(lines.empty)])
if(isTRUE(hline)){
# if the previous entry was a headline, don't tag a paragraph as well
paragraphs <- paragraphs & c(FALSE, !headlines[-length(headlines)])
} else {}
txt[paragraphs] <- "<kRp.p/>"
} else {}
} else {}
tk.pre.lst <- unlist(strsplit(txt, split))
# check for abbreviations to consider
if(!is.null(abbrev)){
check.file(abbrev, mode="exist")
abbrev.vect <- readLines(abbrev, encoding=encoding)
} else {
abbrev.vect <- NULL
}
# call our own simple tokenizer
# it should split strings at every non-letter character, with the
# exception of a defined word-binding (like "-")
tokenized.text <- as.vector(unlist(sapply(tk.pre.lst, function(tkn){
# in case of empty elements, skip to the next one
if(identical(tkn,"")){
return()
} else {}
# if automatic headline or paragraph detection was used, filter their tags
if(tkn %in% c("<kRp.h>", "</kRp.h>", "<kRp.p/>")){
return(tkn)
} else {}
# if it's all just letters and numbers, leave as is
if(!grepl("[^\\p{L}\\p{M}\\p{N}]", tkn, perl=TRUE)){
return(tkn)
} else {
# first, split off escaped quotes
if(grepl("\"", tkn, perl=TRUE)){
tkn <- gsub("(\")([^\\s])", "\\1 \\2", tkn, perl=TRUE)
tkn <- gsub("([^\\s])(\")", "\\1 \\2", tkn, perl=TRUE)
} else {}
# create a vector for later exclusion if a pre/suffix heuristic was chosen and already applied
tkn.nonheur <- tkn
all.fixes <- c()
# for pre- and suffixes, set number of occurring letters
pre.max.num <- "" # the empty default reads as "exactly one letter"
suf.max.num <- "{1,2}"
# see what we need to check for
check.suffix <- ifelse(any(c("en", "fr", "suf") %in% heuristics), TRUE, FALSE)
check.prefix <- ifelse(any(c("fr", "pre") %in% heuristics), TRUE, FALSE)
if("fr" %in% heuristics){
# in french longer suffixes like "'elle" are common
suf.max.num <- "+"
} else {}
# probably split off french prefixes like l'animal
if(isTRUE(check.prefix)){
stopifnot(length(heur.fix$pre) > 0)
heur.prefix <- paste0(heur.fix$pre, collapse="")
# also take care of cases where there's non-letter stuff before the prefix or after the prefixed word
tkn <- gsub(paste0("(^[\\p{Z}\\p{S}\\p{P}]*)([\\p{L}\\p{M}\\p{N}]", pre.max.num, "[", heur.prefix, "])([\\p{L}\\p{M}\\p{N}]+)([\\p{Z}\\p{S}\\p{P}]*$)"),
"\\1 \\2 \\3 \\4", tkn, perl=TRUE)
all.fixes <- unique(c(all.fixes, heur.prefix))
tkn.nonheur <- gsub(paste0("[",all.fixes,"]", collapse=""), "", tkn)
} else {}
# the same for possessive 's and the like
if(isTRUE(check.suffix)){
stopifnot(length(heur.fix$suf) > 0)
heur.suffix <- paste0(heur.fix$suf, collapse="")
# also take care of cases where there's non-letter stuff before the suffix or after the suffixed word
tkn <- gsub(paste0("(^[\\p{Z}\\p{S}\\p{P}]*)([\\p{L}\\p{M}\\p{N}]+)([", heur.suffix, "][\\p{L}\\p{M}\\p{N}]", suf.max.num, ")([\\p{Z}\\p{S}\\p{P}]*$)"),
"\\1 \\2 \\3 \\4", tkn, perl=TRUE)
all.fixes <- unique(c(all.fixes, heur.suffix))
tkn.nonheur <- gsub(paste0("[",all.fixes,"]", collapse=""), "", tkn)
} else {}
# check for possible abbreviations
if(is.null(abbrev.vect) | !tkn %in% abbrev.vect){
# do the heuristics
# currently, single letters followed by a dot will be taken for an abbreviated name;
# the former implementation demanded at least two appearances of that:
#if("abbr" %in% heuristics & grepl("(\\p{L}\\p{M}*\\.){2,}", tkn, perl=TRUE)){
if("abbr" %in% heuristics & grepl("(^\\p{P}*\\p{L}\\p{M}*\\.)", tkn, perl=TRUE)){
# separate closing/opening brackets and dashes
tkn <- gsub("([\\p{Pe}\\p{Ps}\\p{Pd}])([\\p{L}\\p{M}])", "\\1 \\2", tkn, perl=TRUE)
tkn <- gsub("(\\.)([^\\p{L}\\p{M}])", "\\1 \\2", tkn, perl=TRUE)
# take care of probably already applied prefix/suffix heuristics here:
} else if(grepl("([\\p{L}\\p{M}\\p{N}]+)([^\\p{L}\\p{M}\\p{N}\\s]+)|([^\\p{L}\\p{M}\\p{N}\\s]+)([\\p{L}\\p{M}\\p{N}]+)", tkn.nonheur, perl=TRUE)){
# this should be some other punctuation or strange stuff...
tkn <- gsub(paste0("([^\\p{L}\\p{M}\\p{N}",ign.comp.rex,"])([\\p{L}\\p{M}\\p{N}])"), "\\1 \\2", tkn, perl=TRUE)
tkn <- gsub(paste0("([\\p{L}\\p{M}\\p{N}])([^\\p{L}\\p{M}\\p{N}",ign.comp.rex,"])"), "\\1 \\2", tkn, perl=TRUE)
} else {}
# are there clusters of undefined nonword stuff left?
if(grepl("([^\\p{L}\\p{M}\\p{N}\\s]{2,})", tkn, perl=TRUE)){
# as long as it's not dots:
tkn <- gsub(paste0("([^\\p{Zs}])([^\\p{L}\\p{M}\\p{N}.",ign.comp.rex,"\\p{Zs}])"), "\\1 \\2", tkn, perl=TRUE)
tkn <- gsub(paste0("([^\\p{L}\\p{M}\\p{N}.",ign.comp.rex,"\\p{Zs}])([^\\p{Zs}])"), "\\1 \\2", tkn, perl=TRUE)
# keep "..." intact
tkn <- gsub("([^\\p{L}\\p{M}\\p{N}.\\s])([.])", "\\1 \\2", tkn, perl=TRUE)
} else {}
} else {}
new.tkn <- unlist(strsplit(tkn, " "))
# remove empty elements
new.tkn <- new.tkn[!new.tkn %in% ""]
return(new.tkn)
}
}, USE.NAMES=FALSE)))
if(isTRUE(tag)){
tokenized.text <- taggz(tokenized.text, abbrev=abbrev.vect, heur.fix=heur.fix, sntc=sntc, ign.comp=ign.comp)
} else {}
return(tokenized.text)
} ## end function tokenz()
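## usage sketch (not run): tokenize and pre-tag a single string using the
## built-in heuristics:
# tokenz("A short sentence, isn't it?", heuristics="en", tag=TRUE)
# # a two-column matrix of tokens and internal tags like "word.kRp" and ",kRp"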
## function queryList()
queryList <- function(obj, var, query, rel, as.df, ignore.case, perl, regexp_var){
this.query <- query[[1]]
this.query.vars <- names(this.query)
this.q.var <- this.query.vars[[1]]
this.q.query <- this.query[[1]]
this.q.rel <- ifelse("rel" %in% this.query.vars, this.query[["rel"]], rel)
this.q.ignore.case <- ifelse("ignore.case" %in% this.query.vars, this.query[["ignore.case"]], ignore.case)
this.q.perl <- ifelse("perl" %in% this.query.vars, this.query[["perl"]], perl)
if(length(query) == 1){
obj <- query(obj=obj, var=this.q.var, query=this.q.query, rel=this.q.rel, as.df=as.df, ignore.case=this.q.ignore.case, perl=this.q.perl, regexp_var=regexp_var)
} else {
remaining.queries <- query[-1]
remaining.obj <- query(obj=obj, var=this.q.var, query=this.q.query, rel=this.q.rel, as.df=FALSE, ignore.case=this.q.ignore.case, perl=this.q.perl, regexp_var=regexp_var)
obj <- query(obj=remaining.obj, var=var, query=remaining.queries, rel=rel, as.df=as.df, ignore.case=ignore.case, perl=perl, regexp_var=regexp_var)
}
return(obj)
} ## end function queryList()
## function headLine()
# takes text and builds a border around it
headLine <- function(txt, level=1){
if(identical(level, 1)){
headlineTxt <- paste0("## ", txt, " ##")
headlineLine <- paste(rep("#", nchar(headlineTxt, type="width")), collapse="")
headlineFull <- paste0(headlineLine, "\n", headlineTxt, "\n", headlineLine)
} else if(identical(level, 2)){
headlineLine <- paste(rep("=", nchar(txt, type="width")), collapse="")
headlineFull <- paste0(txt, "\n", headlineLine)
} else {
headlineLine <- paste(rep("-", nchar(txt, type="width")), collapse="")
headlineFull <- paste0(txt, "\n", headlineLine)
}
return(headlineFull)
} ## end function headLine()
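## usage sketch (not run):
# cat(headLine("koRpus"), "\n")
# # ############
# # ## koRpus ##
# # ############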
## function matching.lang()
# helper function to match language definitions,
# called by treetag()
matching.lang <- function(lang, lang.preset){
if(!identical(lang, lang.preset)){
warning(
"Language \"",lang,"\" doesn't match the preset \"", lang.preset,"\". If you run into errors, you have been warned!",
call.=FALSE
)
} else {}
}
## end function matching.lang()
## function paste.tokenized.text()
paste.tokenized.text <- function(txt){
# put all text together
all.text <- paste(txt, collapse=" ")
# remove superfluous spaces
all.text <- gsub("([[:space:]]{1})([\\(\\[\\{])([[:space:]]{1}|$)", " \\2", all.text, perl=TRUE)
all.text <- gsub("([[:space:]]{1})([\\)\\]\\}])([[:space:]]{1}|$)", "\\2 ", all.text, perl=TRUE)
all.text <- gsub("([[:space:]]{1})([,;.:])([[:space:]]{1}|$)", "\\2 ", all.text, perl=TRUE)
# return the result explicitly instead of relying on the invisible value of the last assignment
return(all.text)
}
## end function paste.tokenized.text()
## function checkLangPreset()
# checks if a given language preset is defined at all, and either returns TRUE/error or the full preset definition
checkLangPreset <- function(preset, returnPresetDefinition=TRUE){
# koRpus dropped support for non-UTF-8 presets and renamed former presets omitting the "-utf8" suffix
# to not break compatibility, we'll just gracefully remove the suffix
if(grepl("utf8", preset)){
preset <- gsub("-utf8$", "", preset)
warning(paste0("UTF-8 is now the default encoding, please rename your preset from \"", preset, "-utf8\" into just \"", preset, "\"!"), call.=FALSE)
} else {}
preset_definition <- as.list(as.environment(.koRpus.env))[["langSup"]][["treetag"]][["presets"]][[preset]]
if(is.null(preset_definition)){
error_prefix <- "Manual TreeTagger configuration:\n "
# is a language support package installed but not loaded?
# this check could be limited to two letter patterns,
# but let's keep it open for now and see how it works out for the users
lang_pckg_name <- paste0("koRpus.lang.", preset)
preset_status <- check_lang_packages(available=TRUE, pattern=paste0("^", lang_pckg_name))
if(lang_pckg_name %in% names(preset_status)){
lang_pckg <- preset_status[[lang_pckg_name]]
if(isTRUE(lang_pckg[["loaded"]])){
stop(simpleError(paste0(error_prefix, "There appears to be a loaded language package \"", lang_pckg_name, "\",\n yet the preset \"", preset, "\" is not available. Please check your installation!")))
} else if(isTRUE(lang_pckg[["installed"]])){
stop(simpleError(paste0(error_prefix, "The preset \"", preset, "\" is currently not available.\n Did you forget to load the respective language package?\n Try library(\"", lang_pckg_name, "\") instead of library(\"koRpus\").")))
} else if(isTRUE(lang_pckg[["available"]])){
stop(simpleError(paste0(error_prefix, "The preset \"", preset, "\" is currently not available.\n Did you forget to install and load the respective language package?\n Try install.koRpus.lang(\"", preset, "\") and then library(\"", lang_pckg_name, "\") instead of library(\"koRpus\").")))
} else {
stop(simpleError(paste0(error_prefix, "\"", preset, "\" is not a valid preset!")))
}
} else {
stop(simpleError(paste0(error_prefix, "\"", preset, "\" is not a valid preset!")))
}
} else {
if(isTRUE(returnPresetDefinition)){
return(preset_definition)
} else {
return(TRUE)
}
}
}
## end function checkLangPreset()
## function checkTTOptions()
# this helper function does some basic validity checks on provided TT.options
# if all goes well, returns a named list with valid settings
# if manual.config=FALSE returns an empty list because options are omitted anyway
checkTTOptions <- function(TT.options, manual.config, TT.tknz=TRUE){
result <- list()
if(!is.null(TT.options) & !is.list(TT.options)){
warning("You provided \"TT.options\", but not as a list!")
} else {}
optNames <- names(TT.options)
if(isTRUE(manual.config)){
# basic check for valid element names
validOptions <- c(
"path",
"preset",
"tokenizer",
"tknz.opts",
"pre.tagger",
"tagger",
"abbrev",
"params",
"lexicon",
"lookup",
"filter",
"no.unknown",
"splitter",
"splitter.opts"
)
undefined.options <- !optNames %in% validOptions
if(any(undefined.options)){
stop(simpleError(paste0(
"You used undefined names in TT.options:\n \"",
paste0(optNames[undefined.options], collapse="\", \""),
"\""
)))
} else {}
if(!"path" %in% optNames){
stop(simpleError("Manual TreeTagger configuration demanded, but not even a path was defined!"))
} else {
# specify basic paths
result[["TT.path"]] <- TT.options[["path"]]
result[["TT.bin"]] <- file.path(result[["TT.path"]],"bin")
result[["TT.cmd"]] <- file.path(result[["TT.path"]],"cmd")
result[["TT.lib"]] <- file.path(result[["TT.path"]],"lib")
# check if this is really a TreeTagger root directory
sapply(c(result[["TT.bin"]], result[["TT.cmd"]], result[["TT.lib"]]), function(chk.dir){check.file(chk.dir, mode="dir")})
}
# basic options, cannot be toyed with
result[["TT.opts"]] <- "-token -lemma -sgml -pt-with-lemma -quiet"
# allow some dedicated options to be set without jeopardizing the output format
if(!is.null(TT.options[["no.unknown"]])){
result[["TT.opts"]] <- ifelse(
isTRUE(TT.options[["no.unknown"]]),
paste0(result[["TT.opts"]], " -no-unknown"),
result[["TT.opts"]]
)
} else {}
if(!is.null(TT.options[["preset"]])){
result[["preset"]] <- checkLangPreset(preset=TT.options[["preset"]])
if(isTRUE(grepl("tree-tagger-[[:alpha:].]+$|tag-[[:alpha:].]+$|\\.bat$", tolower(TT.options[["tagger"]])))){
# sometimes users try to combine TreeTagger's batch files with presets, which is doomed to fail
stop(simpleError("If you're using a language preset, you must not set TreeTagger's batch files as 'tagger'! "))
} else {}
} else {
# if no preset was defined, we need some more information
if(isTRUE(TT.tknz)){
needed.options <- c("tokenizer", "tagger", "params")
} else {
needed.options <- c("tagger", "params")
}
missing.options <- !needed.options %in% optNames
if(any(missing.options)){
stop(simpleError(paste0(
"Manual TreeTagger configuration demanded, but not enough options given!\n Missing options: \"",
paste0(needed.options[missing.options], collapse="\", \""),
"\""
)))
} else {}
}
} else {}
return(result)
} ## end function checkTTOptions()
## function winPath()
# all of a sudden, the construction of file paths stopped working for some windows users,
# so we're forced to do something really, really ugly and replace all R-like "/"
# file separators with the windows-like "\\" manually.
winPath <- function(path){
return(gsub("/", "\\\\", path))
}
# just for the record: i really *hate* windows!
## end function winPath()
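## usage sketch (not run):
# winPath("C:/TreeTagger/bin")
# # returns "C:\\TreeTagger\\bin"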
## function check_lang_packages()
# checks what koRpus.lang.* packages are currently installed or loaded
# returns a named list with a list for each installed package, providing
# entries named "available", "installed", "loaded", and "title"
# available: also check for all available packages in 'repos'
# available.only: omit all installed packages which cannot be found in 'repos'
#' @importFrom utils available.packages packageDescription
check_lang_packages <- function(
available=FALSE,
repos="https://undocumeantit.github.io/repos/l10n/",
available.only=FALSE,
pattern="^koRpus.lang.*"
){
### this function should be kept close to identical to the respective function
### in the 'sylly' package, except for the pattern
result <- list()
if(isTRUE(available)){
available_packages <- utils::available.packages(repos=repos)
available_koRpus_lang <- grepl(pattern, available_packages[,"Package"])
supported_lang <- unique(available_packages[available_koRpus_lang,"Package"])
} else {
available_koRpus_lang <- FALSE
supported_lang <- NULL
}
loaded_packages <- loadedNamespaces()
loaded_koRpus_lang <- grepl(pattern, loaded_packages)
installed_packages <- unique(dir(.libPaths()))
installed_koRpus_lang <- grepl(pattern, installed_packages)
have_koRpus_lang <- any(installed_koRpus_lang, available_koRpus_lang)
if(isTRUE(have_koRpus_lang)){
if(isTRUE(available.only)){
all_packages <- supported_lang
} else {
all_packages <- unique(c(installed_packages[installed_koRpus_lang], supported_lang))
}
for (this_package in all_packages){
result[[this_package]] <- list(available=NA, installed=FALSE, loaded=FALSE, title="(unknown)")
if(all(isTRUE(available), this_package %in% supported_lang)){
result[[this_package]][["available"]] <- TRUE
} else {}
if(this_package %in% unique(installed_packages[installed_koRpus_lang])){
result[[this_package]][["installed"]] <- TRUE
this_package_index <- which.min(!installed_packages %in% this_package)
result[[this_package]][["title"]] <- utils::packageDescription(installed_packages[this_package_index])[["Title"]]
} else {}
if(this_package %in% unique(loaded_packages[loaded_koRpus_lang])){
result[[this_package]][["loaded"]] <- TRUE
} else {}
}
} else {}
return(result)
} ## end function check_lang_packages()
## function check_toggle_utf8()
# used in treetag() to work around inconsistent naming of parameter
# and other files over the releases; some have "utf8-" or "-utf8" in their name,
# both earlier and later versions don't. some use the ".txt" extension, later versions
# don't. so we'll first try with these pre- and suffixes, then without, and use
# whatever is found first. returns either a validated path or throws an error.
#
# - file_utf8: file to check for existence, the variant including "utf8" in its name
# - dir: full path to expected file (directory only)
# - optional: if TRUE, doesn't throw an error if no file was found but returns the path as given;
# further checks are done elsewhere, we're not handling missing files here, just
# possible alternatives to defaults!
check_toggle_utf8 <- function(file_utf8, dir=NA, optional=FALSE){
if(any(is.null(file_utf8), is.na(file_utf8))){
return(file_utf8)
} else {
if(is.na(dir)){
dir <- dirname(normalizePath(file_utf8, mustWork=FALSE))
file_utf8 <- basename(file_utf8)
} else {
dir <- normalizePath(dir, mustWork=FALSE)
}
check.file(filename=dir, mode="dir", stopOnFail=TRUE)
if(isTRUE(grepl("utf8", file_utf8))){
all_possible_files <- c(file_utf8, gsub("-utf8", "", gsub("^utf8-", "", file_utf8)))
} else {
all_possible_files <- file_utf8
}
if(isTRUE(grepl("\\.txt$", file_utf8))){
all_possible_files <- c(all_possible_files, gsub("\\.txt$", "", all_possible_files))
} else {}
all_possible_files <- file.path(dir, all_possible_files)
all_files_found <- sapply(
all_possible_files,
check.file,
mode="exist",
stopOnFail=FALSE
)
if(any(all_files_found)){
return(all_possible_files[all_files_found][1])
} else {
if(isTRUE(optional)){
return(file_utf8)
} else {
stop(simpleError(
paste0(
"None of the following files were found, please check your TreeTagger installation!\n ",
paste0(all_possible_files, collapse="\n ")
)
))
}
}
}
} ## end function check_toggle_utf8()
## function check_doc_id()
check_doc_id <- function(
doc_id,
default,
quiet=TRUE
){
warn <- FALSE
warn_class <- "unknown"
if(missing(doc_id)){
warn <- TRUE
warn_class <- "missing"
doc_id <- NA
} else {}
if(!is.character(doc_id)){
if(!isTRUE(warn)){
warn <- TRUE
warn_class <- class(doc_id)
} else {}
if(any(is.null(doc_id), is.logical(doc_id))){
if(missing(default)){
stop(simpleError("No \"doc_id\" given and no default value found!"))
} else {}
doc_id <- gsub("[^[:alnum:]_\\\\.-]+", "", default)
} else {
doc_id <- as.character(doc_id)
}
} else {}
if(all(!isTRUE(quiet), warn)){
warning(paste0("Fixed \"doc_id\", was ", warn_class, "!"), call.=FALSE)
} else {}
return(doc_id)
} ## function check_doc_id()