R/koRpus-internal.R

Defines functions check_doc_id check_toggle_utf8 check_lang_packages winPath checkTTOptions checkLangPreset paste.tokenized.text matching.lang headLine queryList tokenz taggz text.1st.letter read.udhr txt.compress is.supported.lang noInf.summary create.corp.freq.object type.freq word.freq text.analysis distrib.to.fixed distrib.from.fixed value.distribs count.sentences tagged.txt.rm.classes indexSentenceDoc stopAndStem dumpTextToTempfile treetag.com explain_tags validate_tags language.setting clean.text basic.tagged.descriptives basic.text.descriptives tag.kRp.txt check.file

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.


# these are internal functions called by some of the methods of koRpus.
# they are not exported, hence not meant to be called by users directly,
# and are therefore only documented by the comments in this file.

## function check.file()
# helper function for file checks
check.file <- function(filename, mode="exist", stopOnFail=TRUE){

  ret.value <- FALSE

  if(any(is.na(filename), is.null(filename))){
    return(FALSE)
  } else {
    if(identical(mode, "exist") | identical(mode, "exec")){
      if(as.logical(file_test("-f", filename))){
        ret.value <- TRUE
      } else {
        if(isTRUE(stopOnFail)){
          stop(simpleError(paste("Specified file cannot be found:\n", filename)))
        } else {}
        ret.value <- FALSE
      }
    } else {}

    if(identical(mode, "exec")){
      if(as.logical(file_test("-x", filename))){
        ret.value <- TRUE
      } else {
        if(isTRUE(stopOnFail)){
          stop(simpleError(paste("Specified file cannot be executed:\n", filename)))
        } else {}
        ret.value <- FALSE
      }
    } else {}

    if(identical(mode, "dir")){
      if(as.logical(file_test("-d", filename))){
        ret.value <- TRUE
      } else {
        if(isTRUE(stopOnFail)){
          stop(simpleError(paste("Specified directory cannot be found:\n", filename)))
        } else {}
        ret.value <- FALSE
      }
    } else {}
  }

  return(ret.value)
} ## end function check.file()
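## usage sketch (not run) -- internal function, so it would have to be called via koRpus::: ;
## the file and directory paths below are purely hypothetical:
#   koRpus:::check.file("some.txt", mode="exist")                                # TRUE or error
#   koRpus:::check.file("/usr/bin/tree-tagger", mode="exec", stopOnFail=FALSE)   # TRUE/FALSE
#   koRpus:::check.file(tempdir(), mode="dir")                                   # TRUE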


## function tag.kRp.txt()
# this function takes normal text OR objects of koRpus classes which carry the
# original information of the analyzed text somewhere, and
# tries to return a valid object of class kRp.text instead
# '...' will be passed through to treetag() or tokenize()
tag.kRp.txt <- function(txt, tagger=NULL, lang, objects.only=TRUE, ...){

  if(inherits(txt, "kRp.text")){
    return(as(txt, "kRp.text"))
  } else {}

  if(isTRUE(objects.only)){
    stop(simpleError(paste("Not a valid class for tag.kRp.txt():", class(txt)[1])))
  } else {
    if(is.character(txt)){
      # set the language definition
      if(is.null(lang)){
        lang <- get.kRp.env(lang=TRUE)
      } else {
        ## TODO: add some validation here
      }
      internal.tokenizer <- FALSE
      # set a fallback if tagger isn't set but objects.only=FALSE
      if(is.null(tagger)){
        if(is.null(get.kRp.env(TT.cmd=TRUE, errorIfUnset=FALSE))){
          message("No TreeTagger command specified. Using tokenize() as fallback")
          internal.tokenizer <- TRUE
        } else {
          tagger <- "kRp.env"
        }
      } else {}
      if(identical(tagger, "kRp.env")){
        if(identical(get.kRp.env(TT.cmd=TRUE), "tokenize")){
          internal.tokenizer <- TRUE
        } else {}
      } else {}
      if(identical(tagger, "tokenize") | isTRUE(internal.tokenizer)){
        tagged.txt <- tokenize(txt, tag=TRUE, lang=lang, ...)
      } else {
        tagged.txt <- treetag(txt, treetagger=tagger, lang=lang, ...)
      }
      return(tagged.txt)
    } else {
      stop(simpleError("Text object is neither of class kRp.text nor character!"))
    }
  }
} ## end function tag.kRp.txt()
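## usage sketch (not run), assuming a language support package such as koRpus.lang.en
## has been loaded; tagger="tokenize" forces the internal tokenizer, so no TreeTagger
## installation is needed for this sketch:
#   tagged <- koRpus:::tag.kRp.txt(
#     "This is a short sample sentence. And a second one.",
#     tagger="tokenize",
#     lang="en",
#     objects.only=FALSE
#   )
#   taggedText(tagged)   # data.frame of tokens with tags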


## function basic.text.descriptives()
# txt must be a character vector
basic.text.descriptives <- function(txt){
  # check for newline characters which might cause trouble
  if(any(grepl("\n", txt))){
      txt <- unlist(strsplit(txt, "\n"))
  } else {}
  # number of characters, including spaces and punctuation
  # the following vector counts chars per line
  vct_all_chars <- nchar(txt, type="width")
  num_lines <- length(vct_all_chars)
  all_chars_no_newline <- sum(vct_all_chars)
  all_chars_incl_newline <- all_chars_no_newline + num_lines

  # split txt into individual characters
  # note this might have a different length than vct_all_chars + num_lines
  # because of grapheme clusters, see ChangeLog for 0.11-5
  txt_all_split <- unlist(strsplit(txt, ""))
  spaces <- grepl("[[:space:]]", txt_all_split)
  num_all_spaces <- sum(spaces)
  num_normalized_spaces <- sum(rle(spaces)[["values"]])
  # count without any spaces
  nospace_chars <- all_chars_no_newline - num_all_spaces
  # count punctuation
  num_punct <- sum(grepl("[^\\p{L}\\p{M}\\p{N}[:space:]]", txt_all_split, perl=TRUE))
  # count digits
  num_digits <- sum(grepl("[\\p{N}]", txt_all_split, perl=TRUE))
  # no spaces, punctuation or digits
  only_letters <- nospace_chars - num_punct - num_digits

  # count each cluster of spaces as only one space
  onespace_chars <- nchar(
    gsub("[[:space:]]{2,}", " ", paste(txt, collapse=" ")),
    type="width"
  )

  results <- list(
    all.chars=all_chars_incl_newline,
    lines=num_lines,
    normalized.space=onespace_chars,
    chars.no.space=nospace_chars,
    punct=num_punct,
    digits=num_digits,
    letters=only_letters
  )
  return(results)
} ## end function basic.text.descriptives()
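## illustration (not run) of the returned list; the field descriptions follow the code above:
#   koRpus:::basic.text.descriptives(c("A first line.", "", "And a second one."))
#   # $all.chars          characters including one newline per line
#   # $lines              number of lines
#   # $normalized.space   characters with space clusters collapsed to a single space
#   # $chars.no.space     characters without any spaces
#   # $punct              punctuation characters
#   # $digits             digit characters
#   # $letters            remaining letter characters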


## function basic.tagged.descriptives()
# txt must be an object of class kRp.text
#  - if "txt.vector" is given alone, an all new desc slot is prepared
#  - if "desc" is given alone, some of its content is being updated by examining "txt"
# using "txt.vector" for re-calculations of lines, all.chars or normalized.space is most definitely a bad idea
# because the vector lost all information on spaces between tokens or newlines. therefore:
#  - if *both* "txt.vector" and "desc" are provided and "update.desc=TRUE", as is done by textTransform(),
#    the old value of lines is left untouched, and all.chars and normalized.space are adjusted according to
#    the difference between the old and new value of chars.no.space
#' @include 02_method_filterByClass.R
basic.tagged.descriptives <- function(txt, lang=NULL, desc=NULL, txt.vector=NULL, update.desc=FALSE, doc_id=NA){
  extended_update <- FALSE
  
  if(is.null(lang)){
    lang <- language(txt)
  } else {}

  if(all(!is.null(txt.vector), !is.null(desc), isTRUE(update.desc))){
    desc.old <- desc
    desc <- basic.text.descriptives(txt.vector)
    extended_update <- TRUE
  } else if(all(!is.null(txt.vector), is.null(desc))){
    # create desc if not present
    desc <- basic.text.descriptives(txt.vector)
  } else {}

  # count sentences
  txt.stend.tags <- kRp.POS.tags(lang, list.tags=TRUE, tags="sentc")
  txt.stend <- count.sentences(taggedText(txt), txt.stend.tags)
  # count words
  txt.nopunct <- filterByClass(txt, corp.rm.class="nonpunct", corp.rm.tag=c(), as.vector=FALSE, update.desc=NULL)
  num.words <- nrow(taggedText(txt.nopunct))
  avg.sentc.length <- num.words / txt.stend
  # character distribution
  char.distrib <- value.distribs(taggedText(txt)[["lttr"]], omit.missings=FALSE)
  lttr.distrib <- value.distribs(taggedText(txt.nopunct)[["lttr"]], omit.missings=FALSE)

  # desc[["letters"]] already had all digits removed
  # we'll use these numbers as they are usually more exact than relying on correct tokenization
   if("letters.only" %in% names(desc)){
     num.letters <- desc[["letters.only"]] + desc[["digits"]]
   } else {
    num.letters <- desc[["letters"]] + desc[["digits"]]
    # for readability calculations
    desc[["letters.only"]] <- desc[["letters"]]
    desc[["letters"]] <- distrib.to.fixed(lttr.distrib, all.values=num.letters, idx="l")
   }
  avg.word.length <- num.letters / num.words

  if(isTRUE(update.desc)){
    desc[["char.distrib"]] <- char.distrib
    desc[["lttr.distrib"]] <- lttr.distrib
    desc[["words"]] <- num.words
    desc[["sentences"]] <- txt.stend
    desc[["avg.sentc.length"]] <- avg.sentc.length
    desc[["avg.word.length"]] <- avg.word.length
    if(isTRUE(extended_update)){
      desc[["lines"]] <- desc.old[["lines"]]
      chars.diff <- desc[["chars.no.space"]] - desc.old[["chars.no.space"]]
      desc[["all.chars"]] <- desc.old[["all.chars"]] + chars.diff
      desc[["normalized.space"]] <- desc.old[["normalized.space"]] + chars.diff
    } else {}
    results <- desc
  } else {
    results <- append(
      desc,
      list(
        char.distrib=char.distrib,
        lttr.distrib=lttr.distrib,
        words=num.words,
        sentences=txt.stend,
        avg.sentc.length=avg.sentc.length,
        avg.word.length=avg.word.length
      )
    )
  }
  results[["doc_id"]] <- doc_id
  return(results)
} ## end function basic.tagged.descriptives()


## function clean.text()
# takes a character vector and a named list. it replaces each occurrence
# of a list name with its value and returns the changed vector
# e.g., use list("\"  "="\"") to remove two spaces behind double quotes
clean.text <- function(txt.vct, from.to=NULL, perl=FALSE){
  if(is.null(from.to)){
    return(txt.vct)
  } else {}
  stopifnot(is.character(txt.vct))
  stopifnot(is.list(from.to))
  for (idx in seq_along(from.to)){
    from <- names(from.to)[[idx]]
    to <- from.to[[idx]]
    txt.vct <- gsub(from, to, txt.vct, perl=perl)
    rm("from", "to")
  }
  return(txt.vct)
} ## end function clean.text()
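## usage sketch (not run), expanding the example from the comment above
## (remove two spaces behind a double quote):
#   txt <- "He said: \"  quoted text\""
#   koRpus:::clean.text(txt, from.to=list("\"  "="\""))
#   # -> "He said: \"quoted text\""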


## function language.setting()
language.setting <- function(tagged.object, force.lang){
  stopifnot(inherits(tagged.object, "kRp.text"))
  if(is.null(force.lang)){
    lang <- tagged.object@lang
  } else {
    lang <- is.supported.lang(lang.ident=force.lang)
  }
  return(lang)
} ## end function language.setting()


## function validate_tags()
# takes a character vector of POS tags and a language identifier
# checks all tags for validity
validate_tags <- function(tags, lang){
  # get all valid tags
  tag.class.def <- kRp.POS.tags(lang)

  # only proceed if all tag values are valid
  all.found.tags <- unique(tags)
  invalid.found.tags <- all.found.tags[!all.found.tags %in% tag.class.def[,"tag"]]
  if(length(invalid.found.tags) > 0){
    warning(paste0("Invalid tag(s) found: ", paste(invalid.found.tags, collapse = ", "),
      "\n  This is probably due to a missing tag in kRp.POS.tags() and",
      "\n  needs to be fixed. It would be nice if you could forward the",
      "\n  above warning dump as a bug report to the package maintaner!\n"), 
      call.=FALSE
    )
  } else {}
  # return a logical vector
  result <- !tags %in% invalid.found.tags
  return(result)
}
## end function validate_tags()


## function explain_tags()
# takes a character vector of POS tags and a language identifier
# returns either an equivalent character vector with explanations of each tag,
# or a matrix if 'cols' is > 1
explain_tags <- function(tags, lang, cols=c("wclass","desc"), valid=rep(TRUE, length(tags))){
  # get all valid tags
  tag.class.def <- kRp.POS.tags(lang)
  tags_explained <- sapply(
    seq_along(tags),
    function(tagNum){
      if(isTRUE(valid[tagNum])){
        return(tag.class.def[tag.class.def[,"tag"] == tags[tagNum], cols])
      } else {
        return(c(wclass="unknown",desc="Unknown (kRp internal)")[cols])
      }
    },
    USE.NAMES=FALSE
  )
  if(length(cols) > 1){
    tags_explained <- as.matrix(t(tags_explained))
  } else {
    tags_explained <- as.character(tags_explained)
  }
  return(tags_explained)
}
## end function explain_tags()
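## usage sketch (not run), assuming koRpus.lang.en is loaded so that English
## POS tags are available; "XYZ" is an intentionally invalid tag:
#   tags <- c("NN", "XYZ")
#   ok <- koRpus:::validate_tags(tags, lang="en")   # c(TRUE, FALSE) plus a warning
#   koRpus:::explain_tags(tags, lang="en", valid=ok)
#   # -> matrix with columns "wclass" and "desc"; the invalid tag is marked "unknown"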


## function treetag.com()
treetag.com <- function(tagged.text, lang, add.desc=TRUE){
  # initialize empty target data.frame to define the order of columns
  newDf <- init.kRp.text.df(rows=nrow(tagged.text))
  newDf[,c("token","lemma")] <- tagged.text[,c("token","lemma")]

  valid_tags <- validate_tags(tags=tagged.text[,"tag"], lang=lang)
  # get all valid tags
  tag.class.def <- kRp.POS.tags(lang)
 
  # make tag a factor with all possible tags for this language as levels
  newDf[["tag"]] <- factor(
    tagged.text[,"tag"],
    # keep invalid tags for debugging
    levels=unique(c(tag.class.def[,"tag"], tagged.text[!valid_tags,"tag"]))
  )
  # count number of letters, add column "lttr"
  newDf[["lttr"]] <- as.integer(nchar(tagged.text[,"token"], type="width"))

  # add further columns "wclass" and "desc"
  if(isTRUE(add.desc)){
    tagsExplained <- explain_tags(tags=tagged.text[,"tag"], lang=lang, cols=c("wclass","desc"), valid=valid_tags)
    tagsExplained.wclass <- factor(
      tagsExplained[,"wclass"],
      levels=unique(tag.class.def[,"wclass"])
    )
    tagsExplained.desc <- factor(
      tagsExplained[,"desc"],
      levels=unique(tag.class.def[,"desc"])
    )
  } else {
    tagsExplained.wclass <- factor(
      explain_tags(tags=tagged.text[,"tag"], lang=lang, cols="wclass", valid=valid_tags),
      levels=unique(tag.class.def[,"wclass"])
    )
    tagsExplained.desc <- NA
  }

  newDf[["wclass"]] <- tagsExplained.wclass
  newDf[["desc"]] <- tagsExplained.desc
    
  return(newDf)
} ## end function treetag.com()


## function dumpTextToTempfile()
# text: either a plain text vector or a connection (whose content needs to be dumped to a file)
# encoding: output file encoding (will default to UTF-8 if NULL)
# pattern: tempfile name pattern
# fileext: tempfile name extension
dumpTextToTempfile <- function(text, encoding=NULL, pattern="tempTextFromObject", fileext=".txt"){
  conn.tempfile.path <- tempfile(pattern=pattern, fileext=fileext)
  # encoding can always become an issue; try to stick to UTF-8 if no other encoding was specified
  if(is.null(encoding)){
    conn.tempfile <- file(conn.tempfile.path, open="w", encoding="UTF-8")
  } else {
    conn.tempfile <- file(conn.tempfile.path, open="w", encoding=encoding)
  }
  on.exit(close(conn.tempfile))
  if(inherits(text, "connection")){
    writeLines(readLines(text, encoding=ifelse(is.null(encoding), "", encoding)), con=conn.tempfile)
  } else {
    writeLines(text, con=conn.tempfile)
  }
  return(conn.tempfile.path)
} ## end function dumpTextToTempfile()


## function stopAndStem()
# tagged.text.df is a data.frame from treetag() or tokenize(), to become "tokens"
stopAndStem <- function(tagged.text.df, stopwords=NULL, stemmer=NULL, lowercase=TRUE){

  if(!is.null(stopwords)){
    if(!is.character(stopwords)){
      stop(simpleError("Stopwords must be specified as a character vector!"))
    } else {}
    # treat all tokens and stopwords in lower case?
    if(isTRUE(lowercase)){
      this.token <- tolower(tagged.text.df[["token"]])
      stopwords <- tolower(stopwords)
    } else {
      this.token <- tagged.text.df[["token"]]
    }
    # check if token is a stopword, add column "stop"
    tagged.text.df[["stop"]] <- this.token %in% stopwords
  } else {
    tagged.text.df[["stop"]] <- NA
  }

  # check for stemming
  if(inherits(stemmer, "R_Weka_stemmer_interface") || is.function(stemmer)){
    tagged.text.df[["stem"]] <- stemmer(tagged.text.df[,"token"])
  } else {
    tagged.text.df[["stem"]] <- NA
  }

  return(tagged.text.df)
} ## end function stopAndStem()
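## usage sketch (not run); "tokens.df" stands in for the data.frame produced by
## tokenize()/treetag(), and the stopword vector is just an illustration:
#   tokens.df <- data.frame(token=c("The", "cat", "sat"), stringsAsFactors=FALSE)
#   koRpus:::stopAndStem(tokens.df, stopwords=c("the", "a"), lowercase=TRUE)
#   # adds a logical column "stop" (TRUE for "The") and a column "stem" (NA, since no stemmer was given)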


## function indexSentenceDoc()
# after stopAndStem(), add columns "idx", "sntc" and "doc_id" to data.frame
indexSentenceDoc <- function(tagged.text.df, lang, doc_id=NA){
  numTokens <- nrow(tagged.text.df)
  tagged.text.df[["idx"]] <- 1:numTokens
  sentenceEnding <- kRp.POS.tags(lang=lang, tags=c("sentc"), list.classes=TRUE)
  endedSentences <- which(tagged.text.df[["wclass"]] %in% sentenceEnding)
  if(length(endedSentences) > 0){
    # handle texts that don't end with a sentence ending
    ## TODO: be smarter here -- if the sentence is a quote, the closing quote comes *after* the fullstop
    ## we'll just ignore this for now!
    if(endedSentences[length(endedSentences)] < numTokens){
      endedSentences[length(endedSentences)] <- numTokens
    } else {}
    tagged.text.df[["sntc"]] <- unlist(sapply(
      seq_along(endedSentences),
      function(numSentence){
        if(numSentence > 1){
          return(rep(numSentence, endedSentences[numSentence] - endedSentences[numSentence - 1]))
        } else {
          return(rep(numSentence, endedSentences[numSentence]))
        }
      },
      simplify=FALSE
    ))
  } else {
    tagged.text.df[["sntc"]] <- NA
  }
  # keep a doc_id if present and new value would be NA
  if(!"doc_id" %in% colnames(tagged.text.df) | !is.na(doc_id)){
    tagged.text.df[["doc_id"]] <- factor(doc_id)
  } else if(length(unique(tagged.text.df[["doc_id"]])) > 1){
    # just check for sane doc_id
    stop(simpleError("You provided a doc_id column that included multiple IDs. This is not supported."))
  } else {}
  return(tagged.text.df)
} ## end function indexSentenceDoc()


## function tagged.txt.rm.classes()
# takes a tagged text object and returns it without punctuation or other defined
# classes or tags. can also return tokens in lemmatized form.
# 
# boolean: don't return the actual reduced data, but a logical vector indicating
#   which values (i.e. rows) would have been removed
# NOTE: "lemma" only takes effect if "as.vector=TRUE"!
tagged.txt.rm.classes <- function(txt, lemma=FALSE, lang, corp.rm.class, corp.rm.tag, as.vector=TRUE, boolean=FALSE){
  # to avoid needless NOTEs from R CMD check
  wclass <- tag <- rel.col <- NULL

  stopifnot(is.data.frame(txt))

  valid.tagset <- kRp.POS.tags(lang)
  txt.rm.tags <- c()
  if(identical(corp.rm.class, "nonpunct")){
    corp.rm.class <- kRp.POS.tags(lang, tags=c("punct", "sentc"), list.classes=TRUE)
  } else {}

  # "stopword" needs to be treated differently, it's another column
  if(any(corp.rm.class == "stopword")){
    if(all(is.na(txt[, "stop"]))){
      warning("Stopword removal not possible: All values are NA! Did you provide a stopword list when tokenizing?", call.=FALSE)
    } else {
      txt <- txt[!txt[["stop"]],]
    }
    # that's all we need -- remove the entry from the vector
    corp.rm.class <- corp.rm.class[!corp.rm.class %in% "stopword"]
  } else {}
  
  if(is.vector(corp.rm.class) && length(corp.rm.class) > 0){
    # only proceed if all class values are valid
    if(all(corp.rm.class %in% valid.tagset[, "wclass"])){
      txt.rm.tag.classes <- valid.tagset[valid.tagset[, "wclass"] %in% corp.rm.class, "tag"]
      txt.rm.tags <- unique(c(txt.rm.tags, txt.rm.tag.classes))
    } else {
      stop(simpleError("Invalid value in corp.rm.class!"))
    }
  } else {}

  if(is.vector(corp.rm.tag) && length(corp.rm.tag) > 0){
    # only proceed if all class values are valid
    if(all(corp.rm.tag %in% valid.tagset[, "tag"])){
      txt.rm.tags <- unique(c(txt.rm.tags, corp.rm.tag))
    } else {
      stop(simpleError("Invalid value in corp.rm.tag!"))
    }
  } else {}

  # in this vector, FALSE means "remove"
  rm.boolean <- !txt[["tag"]] %in% txt.rm.tags

  if(isTRUE(boolean)){
    return(rm.boolean)
  } else if(isTRUE(as.vector)){
    # return only a vector with the tokens themselves, or the whole object?
    if(isTRUE(lemma)){
      return(txt[rm.boolean, "lemma"])
    } else{
      return(txt[rm.boolean, "token"])
    }
  } else {
    return(txt[rm.boolean,])
  }
} ## end function tagged.txt.rm.classes()


## function count.sentences()
# expects txt to be the tagged tokens data.frame of a kRp.text object,
# and tags to be a vector of POS tags indicating sentence endings
count.sentences <- function(txt, tags){
  num.sentences <- length(unlist(sapply(tags, function(x){which(txt[,"tag"] == x)}), use.names=FALSE))
  return(num.sentences)
} ## end function count.sentences()


## function value.distribs()
value.distribs <- function(value.vect, omit.missings=TRUE){
  vector.length <- length(unlist(value.vect))
  # some hard to tokenize texts can end up with really strange values here,
  # exceeding the threshold of maxsum=100 for summary() on factors.
  # as a result, the as.numeric(names()) a few lines below will fail because
  # vector.summary has one last entry called "(Other)".
  # so we'll dynamically adjust maxsum to the needed value
  value.factor <- as.factor(value.vect)
  vector.summary <- summary(value.factor, maxsum=length(levels(value.factor)))

  # try to fill up missing values with 0, e.g., found no words with 5 characters
  if(!isTRUE(omit.missings)){
    if(!is.numeric(value.vect)){
      # only makes sense for numeric values
      stop(simpleError("value.distribs(): Impute missings is only valid for numeric vectors!"))
    } else {}
    present.values <- as.numeric(names(vector.summary))
    max.value <- max(present.values)
    missing.values <- c(1:max.value)[!c(1:max.value) %in% present.values]
    append.to.summary <- rep(0, length(missing.values))
    names(append.to.summary) <- as.character(missing.values)
    vector.summary <- c(vector.summary, append.to.summary)
    # finally sort the outcome
    vector.summary <- vector.summary[order(as.numeric(names(vector.summary)))]
  } else {}

  # add cumulative values
  vect.summ.cum <- cumsum(vector.summary)
  # inverse 
  vect.summ.inv <- rbind(vector.summary, vect.summ.cum, vector.length - vect.summ.cum)
  # add percentages
  vect.summ.cum.pct <- rbind(vect.summ.inv, (vect.summ.inv * 100 / vector.length))
  dimnames(vect.summ.cum.pct)[[1]] <- c("num", "cum.sum", "cum.inv", "pct", "cum.pct", "pct.inv")
  return(vect.summ.cum.pct)
} ## end function value.distribs()
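## illustration (not run): distribution of values 1-4 from a small numeric vector;
## rows are counts, cumulative counts, inverse cumulative counts and the respective percentages:
#   koRpus:::value.distribs(c(1, 2, 2, 4, 4, 4), omit.missings=FALSE)
#   # columns "1" to "4" (the missing value 3 is imputed with 0 occurrences),
#   # rows "num", "cum.sum", "cum.inv", "pct", "cum.pct", "pct.inv"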


## function distrib.from.fixed()
# make distribution matrices from fixed text features
distrib.from.fixed <- function(fixed.vect, all.words, idx="s", unaccounted="x"){
  num.accounted <- fixed.vect[grep(paste0(idx, "[[:digit:]]+"), names(fixed.vect))]
  num.unaccounted <- all.words - sum(num.accounted)
  num.names <- names(num.accounted)

  distrib <- value.distribs(c(unlist(sapply(num.names, function(this.val){
      this.val.num <- fixed.vect[[this.val]]
      this.val.name <- as.numeric(gsub("[^[:digit:]]", "", this.val))
      return(rep(this.val.name, this.val.num))
    })), if(num.unaccounted > 0){rep(unaccounted, num.unaccounted)}))

  return(distrib)
} ## end function distrib.from.fixed()


## function distrib.to.fixed()
# the other way round, to be able to use the desc slot with readability.num()
distrib.to.fixed <- function(distrib, all.values, idx="s"){
  values <- distrib["num",]
  names(values) <- paste0(idx, colnames(distrib))
  values <- c(all=all.values, values)
  return(values)
} ## end function distrib.to.fixed()
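## round-trip sketch (not run): turn a distribution matrix into the fixed named-vector
## format used in the desc slot and back again; the numbers are just an illustration:
#   lttr.distrib <- koRpus:::value.distribs(c(1, 2, 2, 4, 4, 4), omit.missings=FALSE)
#   fixed <- koRpus:::distrib.to.fixed(lttr.distrib, all.values=17, idx="l")
#   # -> c(all=17, l1=1, l2=2, l3=0, l4=3)
#   koRpus:::distrib.from.fixed(fixed, all.words=6, idx="l")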


## function text.analysis()
# "txt" must be a tagged and commented text object!
text.analysis <- function(txt, lang, corp.rm.class, corp.rm.tag, desc){
  ## global stuff
  # count sentences
  txt.stend.tags <- kRp.POS.tags(lang, list.tags=TRUE, tags="sentc")
  txt.vector <- as.vector(txt[,"token"])

  txt.nopunct <- tagged.txt.rm.classes(txt, lemma=FALSE, lang=lang, corp.rm.class=corp.rm.class, corp.rm.tag=corp.rm.tag)

  txt.lemma <- tagged.txt.rm.classes(txt, lemma=TRUE, lang=lang, corp.rm.class=corp.rm.class, corp.rm.tag=corp.rm.tag)
  txt.lemma.types <- unique(txt.lemma)
  ## end global stuff

  ## statistics on word classes
  word.classes <- kRp.POS.tags(lang, list.classes=TRUE)
  # true/false-matrix for word classes
  class.matrix <- sapply(word.classes, function(x){txt[,"wclass"] == x})
  class.num <- colSums(class.matrix)
  # number of unique tokens (types) per word class
  class.matrix.types <- sapply(word.classes, function(x){length(unique(txt[,"token"][class.matrix[,x]]))})
  # number of unique lemmas per word class
  class.matrix.lemma <- sapply(word.classes, function(x){length(unique(txt[,"lemma"][class.matrix[,x]]))})

  # counting classes
  class.analysis <- sapply(word.classes, function(wd.class){
      txt.num.class <- txt[,"wclass"] == wd.class
      assign(wd.class, txt.vector[txt.num.class])
      }
    )

  # length of sentences
  # for this, first we need the text with only words and sentence ending punctuation
  txt.sentc.ends <- txt[txt[,"tag"] %in% kRp.POS.tags(lang=lang, list.tags=TRUE, tags=c("words", "sentc")), ]
  sentence.index <- c(0, which(txt.sentc.ends[,"tag"] %in% txt.stend.tags))
  ## note: if the last sentence didn't end with some punctuation, it's not counted as a sentence at all!
  if(identical(sentence.index, 0)){
    # seems there are no sentences at all
    # we'll take the number of all words as an estimate
    sentence.lengths <- length(txt[txt[,"tag"] %in% kRp.POS.tags(lang=lang, list.tags=TRUE, tags=c("words")), ])
  } else {
    sentence.lengths <- sapply(2:length(sentence.index), function(sntc.ends.here){
        sentc.words <- sentence.index[sntc.ends.here] - sentence.index[sntc.ends.here - 1] - 1
        return(sentc.words)
      })
  }

  ## further descriptives
  num.questions <- sum(txt.vector %in% "?")
  num.exclamats <- sum(txt.vector %in% "!")
  num.semicolon <- sum(txt.vector %in% ";")
  num.colon     <- sum(txt.vector %in% ":")

  add.results <- list(
              all.words=txt.nopunct,
              all.lemmata=txt.lemma.types,
              classes=class.analysis,
              lemmata=length(txt.lemma.types),
              freq.token=class.num,
              freq.types=class.matrix.types,
              freq.lemma=class.matrix.lemma,
              sentc.length=sentence.lengths,
              sentc.distrib=value.distribs(sentence.lengths),
              questions=num.questions,
              exclam=num.exclamats,
              semicolon=num.semicolon,
              colon=num.colon
          )
  results <- append(add.results, desc)
  return(results)
} ## end function text.analysis()


## function word.freq()
# takes single character strings or vectors and looks up their appearance frequency in corpora data
# "rel" can be used to define the interesting relation:
# "pct", "pmio", "rank.avg", "rank.min", "inDocs" or "idf"
word.freq <- function(txt, corp.freq, rel, zero.NAs=FALSE){
  # some basic sanity checks
  stopifnot(inherits(corp.freq, "kRp.corp.freq"))
  if(!is.character(txt)){
    stop(simpleError("Word frequency analysis can only be run on character data!"))
  } else {}

  if(!all(rel %in% c("pct", "pmio", "log10", "rank.avg", "rank.min", "rank.rel.avg", "rank.rel.min", "inDocs", "idf"))){
    stop(simpleError(paste0("Option \"rel\" must be \"pct\", \"pmio\", \"log10\", \"rank.avg\", \"rank.min\", \"rank.rel.avg\",\n",
      "\"rank.rel.min\", \"inDocs\" or \"idf\"!")))
  } else {}

  corp.index <- match(txt, corp.freq@words$word)
  results <- corp.freq@words[corp.index, rel]

  if(isTRUE(zero.NAs) & any(is.na(results))){
    if(inherits(results, "data.frame")){
      rowsWithNAs <- unique(which(is.na(results), arr.ind=TRUE)[,"row"])
      results[rowsWithNAs,] <- rep(0, ncol(results))
    } else {
      results[is.na(results)] <- 0
    }
  } else {}
  return(results)
} ## end function word.freq()


## function type.freq()
# this function will identify unique types in a tagged text object
# and count how often each of them appears in the text
# - txt: must be a tagged text object
# - vector.only: if the numbers and letters are not needed, return the vector of types and quit
type.freq <- function(txt, case.sens=TRUE, verbose=FALSE, lemma=FALSE, fail.if.no.lemmas=TRUE, vector.only=FALSE){
  # shall we count all tokens or their lemmas?
  if(isTRUE(lemma)){
    # do a sanity check, are lemmata present?
    if(identical(unique(txt[,"lemma"]), "")){
      if(isTRUE(fail.if.no.lemmas)){
        stop(simpleError(paste0("You asked to use lemmata, but your text object doesn't include any!",
          "\n  This is probably the case because you used tokenize() instead of treetag().")))
      } else {
        return(NULL)
      }
    } else {
      relevant.tokens <- "lemma"
    }
  } else {
    relevant.tokens <- "token"
  }

  all.tokens <- txt[,c(relevant.tokens,"lttr")]
  if(!isTRUE(case.sens)){
    all.tokens[[relevant.tokens]] <- tolower(all.tokens[[relevant.tokens]])
  } else {}
  corp.freq <- unique(all.tokens)
  colnames(corp.freq) <- c("type","lttr")
  if(isTRUE(vector.only)){
    return(corp.freq[["type"]])
  } else {
    corp.freq[["freq"]] <- 0
    num.tokens <- nrow(all.tokens)
    num.types <- nrow(corp.freq)
    if(isTRUE(verbose)){
      type.counter <- 1
    } else {}
    for (tp in seq_along(corp.freq[["type"]])){
      corp.freq[tp, "freq"] <- sum(all.tokens[[relevant.tokens]] %in% corp.freq[tp, "type"], na.rm=TRUE)
      if(isTRUE(verbose)){
        message(paste0("\t", floor(100*type.counter/num.types), "% complete, processing ", relevant.tokens, " ", type.counter, " of ", num.types, ": \"", corp.freq[tp, "type"], "\""))
        message(paste0(" (found ", corp.freq[tp, "freq"], " times in ", num.tokens, " ", relevant.tokens, "s)\n"))
        type.counter <- type.counter + 1
      } else {}
    }
    # order results
    corp.freq <- corp.freq[order(corp.freq[,"freq"], corp.freq[,"lttr"], decreasing=TRUE),]
    rownames(corp.freq) <- NULL
  }
  return(corp.freq)
} ## end function type.freq()
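## usage sketch (not run); a tiny stand-in for the token data.frame of a tagged
## text object, counting how often each type occurs:
#   toy <- data.frame(
#     token=c("the", "The", "cat", "the"),
#     lttr=c(3, 3, 3, 3),
#     stringsAsFactors=FALSE
#   )
#   koRpus:::type.freq(toy, case.sens=FALSE)
#   # -> two types, "the" with freq 3 and "cat" with freq 1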


## function create.corp.freq.object()
# this function should be used to create corpus frequency objects
# the idea is to have this one function so that any kind of corpora data
# can be squeezed into the format we want.
# "matrix.freq" needs to be a matrix with three columns:
#     "num" (some ID), "word" (the actual running word form) and "freq" (absolute frequency).
#     optional columns are "lemma", "tag", "wclass", "inDocs" and "idf".
# "df.meta" must be a data.frame with two columns: "meta" (name of meta information) and its "value".
# "dscrpt.meta" must be a data.frame with six columns: "tokens" (old: "words"), "types" (old: "dist.words"),
#   "words.p.sntc", "chars.p.sntc", "chars.p.wform" and "chars.p.word"; if NULL its value is set to an empty default
# "extra.cols" is an optional data.frame with additional columns, e.g. valence data
# "caseSens" determines whether frequency stats should distinguish between letter cases or consolidate otherwise identical tokens
create.corp.freq.object <- function(matrix.freq, num.running.words, df.meta, df.dscrpt.meta,
  matrix.table.bigrams=NULL, matrix.table.cooccur=NULL, extra.cols=NULL, caseSens=TRUE, quiet=FALSE){
  tokenFreq <- as.numeric(matrix.freq[,"freq"])
  if(!isTRUE(caseSens)){
    # first look for tokens that only differ in letter case and
    # replace freq with sum of all occurrences
    lowerTokens <- tolower(matrix.freq[,"word"])
#     caseSensTokens <- unique(lowerTokens[duplicated(lowerTokens)])
#     caseSensFreq <- as.list(rep(0, length(caseSensTokens)))
#     names(caseSensFreq) <- caseSensTokens
#     caseSensFreq <- as.environment(caseSensFreq)
    caseSensFreq <- new.env()
    if(!isTRUE(quiet)){
      num.tokens <- length(lowerTokens) # length(caseSensTokens)
      message(paste0("Re-calculating token frequencies (case insensitve, ", num.tokens, " tokens)"))
      # just some feedback, so we know the machine didn't just freeze...
      prgBar <- txtProgressBar(min=0, max=num.tokens, style=3)
    } else {}

    start.with <- new.env()
    start.with[["count"]] <- 1
    # we don't really need this object, sapply updates the environment caseSensFreq
    tmp <- sapply(
      seq_along(tokenFreq),
      function(numToken){
        if(!isTRUE(quiet)){
          # update progress bar
          setTxtProgressBar(prgBar, start.with[["count"]])
        } else {}
        thisLowToken <- lowerTokens[numToken]
        if(is.null(caseSensFreq[[thisLowToken]])){
          caseSensFreq[[thisLowToken]] <- tokenFreq[numToken]
        } else {
          caseSensFreq[[thisLowToken]] <- caseSensFreq[[thisLowToken]] + tokenFreq[numToken]
        }
        start.with[["count"]] <- start.with[["count"]] + 1
      }
    )
    tokenFreq <- unlist(sapply(
      lowerTokens,
      function(thisToken){
        caseSensFreq[[thisToken]]
      },
      USE.NAMES=FALSE
    ))
    if(!isTRUE(quiet)){
      setTxtProgressBar(prgBar, num.tokens)
      close(prgBar)
    } else {}
  } else {}

  # try to work around missing meta information
  if(is.na(num.running.words)){
    num.running.words <- sum(tokenFreq)
  }
  # calculate rank data
  rank.avg <- rank(tokenFreq, ties.method="average")
  rank.min <- rank(tokenFreq, ties.method="min")
  # for better comparability, compute relative ranks
  # can take values between 0 and 100
  rank.rel.avg <- (rank.avg / max(rank.avg)) * 100
  rank.rel.min <- (rank.min / max(rank.min)) * 100

  words.per.mio <- tokenFreq %/% (num.running.words/1000000)
  log10.per.mio <- log10(words.per.mio)
  # correct for lowest frequency words and log10(0), which returns -Inf
  log10.per.mio[log10.per.mio < 0] <- 0

  # check if the optional columns "lemma", "tag", "wclass", "inDocs" and "idf" are present
  if("lemma" %in% colnames(matrix.freq)){
    have.lemma <- matrix.freq[,"lemma"]
  } else {
    have.lemma <- NA
  }
  if("tag" %in% colnames(matrix.freq)){
    have.tag <- matrix.freq[,"tag"]
  } else {
    have.tag <- NA
  }
  if("wclass" %in% colnames(matrix.freq)){
    have.wclass <- matrix.freq[,"wclass"]
  } else {
    have.wclass <- NA
  }
  if("inDocs" %in% colnames(matrix.freq)){
    have.inDocs <- matrix.freq[,"inDocs"]
  } else {
    have.inDocs <- NA
  }
  if("idf" %in% colnames(matrix.freq)){
    have.idf <- matrix.freq[,"idf"]
  } else {
    have.idf <- NA
  }
  df.words <- data.frame(
              num=as.numeric(matrix.freq[,"num"]),
              word=matrix.freq[,"word"],
              lemma=have.lemma,
              tag=have.tag,
              wclass=have.wclass,
              lttr=nchar(matrix.freq[,"word"], type="width", allowNA=TRUE),
              freq=tokenFreq,
              pct=tokenFreq/num.running.words,
              pmio=words.per.mio,
              log10=log10.per.mio,
              rank.avg=rank.avg,
              rank.min=rank.min,
              rank.rel.avg=rank.rel.avg,
              rank.rel.min=rank.rel.min,
              inDocs=have.inDocs,
              idf=have.idf,
              stringsAsFactors=FALSE)
  # add extra columns, if given
  if(!is.null(extra.cols)){
    df.words <- cbind(df.words, extra.cols, stringsAsFactors=FALSE)
  } else {}
  # set missing information to valid defaults
  if(is.null(df.dscrpt.meta)){
    df.dscrpt.meta <- slot(kRp_corp_freq(), "desc")
  } else {}
  if(is.null(df.meta)){
    df.meta <- slot(kRp_corp_freq(), "meta")
  } else {}
  if(is.null(matrix.table.bigrams)){
    df.table.bigrams <- slot(kRp_corp_freq(), "bigrams")
  } else {
    message("Fetching bigram tokens from data... ", appendLF=FALSE)
    df.table.bigrams <- data.frame(
      token1=df.words[
        sapply(
          as.numeric(matrix.table.bigrams[,"token1"]),
          function(x){
            which(df.words[,"num"] == x)
          }
        ), "word"],
      token2=df.words[
        sapply(
          as.numeric(matrix.table.bigrams[,"token2"]),
          function(x){
            which(df.words[,"num"] == x)
          }
        ), "word"],
      freq=as.numeric(matrix.table.bigrams[,"freq"]),
      sig=as.numeric(matrix.table.bigrams[,"sig"])
    )
    # sort by frequency
    message("sorting... ", appendLF=FALSE)
    df.table.bigrams <- df.table.bigrams[with(df.table.bigrams, order(freq, decreasing=TRUE)),]
    message("done.")
  }
  if(is.null(matrix.table.cooccur)){
    df.table.cooccur <- slot(kRp_corp_freq(), "cooccur")
  } else {
    message("Fetching co-occurrence tokens from data... ", appendLF=FALSE)
    df.table.cooccur <- data.frame(
      token1=df.words[
        sapply(
          as.numeric(matrix.table.cooccur[,"token1"]),
          function(x){
            which(df.words[,"num"] == x)
          }
        ), "word"],
      token2=df.words[
        sapply(
          as.numeric(matrix.table.cooccur[,"token2"]),
          function(x){
            which(df.words[,"num"] == x)
          }
        ), "word"],
      freq=as.numeric(matrix.table.cooccur[,"freq"]),
      sig=as.numeric(matrix.table.cooccur[,"sig"])
    )
    # sort by frequency
    message("sorting... ", appendLF=FALSE)
    df.table.cooccur <- df.table.cooccur[with(df.table.cooccur, order(freq, decreasing=TRUE)),]
    message("done.")
  }
  results <- kRp_corp_freq(
    meta=df.meta,
    words=df.words,
    desc=df.dscrpt.meta,
    bigrams=df.table.bigrams,
    cooccur=df.table.cooccur,
    caseSens=caseSens
  )
  return(results)
} ## end function create.corp.freq.object()


## function noInf.summary()
# helper function to produce characteristics summaries without infinite values
# used in the show methods
# data must be a vector
noInf.summary <- function(data, add.sd=FALSE){
  data[is.infinite(data)] <- NA
  print(summary(data))
  if(isTRUE(add.sd)){
    cat("   SD\n ", round(sd(data, na.rm=TRUE), digits=4), "\n", sep="")
  } else {}
} ## end function noInf.summary()
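## illustration (not run): Inf values are treated as NA so they don't distort the summary:
#   koRpus:::noInf.summary(c(1, 2, Inf, 4), add.sd=TRUE)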


## function is.supported.lang()
# determines if a language is supported, and returns the correct identifier
is.supported.lang <- function(lang.ident, support="treetag"){
  if(identical(support, "treetag")){
    treetag.supported <- as.list(as.environment(.koRpus.env))[["langSup"]][["kRp.POS.tags"]][["tags"]]
    if(lang.ident %in% names(treetag.supported)){
      res.ident <- lang.ident
    } else {
      stop(simpleError(
        paste0(
          "Unknown tag definition requested: ", lang.ident, "\n",
          "See ?available.koRpus.lang() for a list of supported languages."
        )
      ))
    }
  } else {}

  return(res.ident)
} ## end function is.supported.lang()


## function txt.compress()
# will gzip an object and return file size and compression ratio
# can be used to estimate redundancy on a global level
txt.compress <- function(obj, level=9, ratio=FALSE, in.mem=TRUE){

  if(is.character(obj)){
    txt <- obj
  } else if(inherits(obj, "kRp.text")){
    txt <- pasteText(obj)
  } else {
    stop(simpleError("Cannot compress objects which are neither character nor of a koRpus class!"))
  }

  if(isTRUE(in.mem)){
    zz.gz <- memCompress(txt, "gzip")
    zz.gz.size <- object.size(zz.gz)
    if(isTRUE(ratio)){
      zz.non <- memCompress(txt, "none")
      zz.non.size <- object.size(zz.non)
      ratio <- as.numeric(zz.gz.size / zz.non.size)
    } else {
      zz.non.size <- NA
      ratio <- NA
    }
  } else {
    tmp.path <- tempfile("koRpus.gz")
      if(!dir.create(tmp.path, recursive=TRUE)) stop(simpleError("Compression skipped: Can't create temporary directory!"))
    # if the function is done, remove the tempdir
    on.exit(unlink(tmp.path, recursive=TRUE))

    # (probably) create two connections, one compressed
    zz.gz <- gzfile(file.path(tmp.path, "bloat.gz"), open="w", compression=level)
    writeLines(txt, zz.gz, sep=" ")
    close(zz.gz)
    zz.gz.size <- file.info(file.path(tmp.path, open="bloat.gz"))$size
    if(isTRUE(ratio)){
      zz.non <- file(file.path(tmp.path, "bloat"), open="w")
      writeLines(txt, zz.non, sep=" ")
      close(zz.non)
      zz.non.size <- file.info(file.path(tmp.path, open="bloat"))$size
      ratio <- zz.gz.size / zz.non.size
    } else {
      zz.non.size <- NA
      ratio <- NA
    }
  }

  results <- list(file.size=zz.non.size, gz.size=zz.gz.size, ratio=ratio)

  return(results)
} ## end function txt.compress()
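## usage sketch (not run): estimate the gzip compression ratio of a plain character
## string in memory; a ratio close to 1 indicates little redundancy:
#   koRpus:::txt.compress(
#     paste(rep("the quick brown fox", 50), collapse=" "),
#     ratio=TRUE,
#     in.mem=TRUE
#   )
#   # -> list(file.size=..., gz.size=..., ratio=...)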


## function read.udhr()
# this function will read the Universal Declaration of Human Rights from text files by
# the UDHR in Unicode project: https://unicode.org/udhr/downloads.html
# txt.path must be a character string pointing to the directory where the bulk files
# were extracted, or to the ZIP file.
read.udhr <- function(txt.path, quiet=TRUE){

  # check if txt.path is a zip file
  if(!as.logical(file_test("-d", txt.path))){
    if(file.exists(txt.path) & grepl("(\\.zip|\\.ZIP)$", txt.path)){
      # ok, seems to be an existing zip file
      # unpack it to a temporary location and alter variable to use that location
      tmp.path <- tempfile("koRpus.UDHR")
      if(!dir.create(tmp.path, recursive=TRUE)) stop(simpleError("UDHR: Can't create temporary directory!"))
      # if the function is done, remove the tempdir
      on.exit(unlink(tmp.path, recursive=TRUE))
      unzip(txt.path, exdir=tmp.path)
      udhr.path <- tmp.path
    } else {
      stop(simpleError(paste("Cannot access UDHR location:", txt.path)))
    }
  } else {
    udhr.path <- txt.path
  }

  #   # requires package XML
  #   udhr.xml <- xmlParse(file.path(udhr.path, "index.xml"))
  #   udhr.list <- xmlSApply(xmlRoot(udhr.xml), xmlAttrs)
  ## since there was no windows package XML for R 2.13, this is a primitive parser which does the job:
  udhr.XML <- readLines(file.path(udhr.path, "index.xml"), warn=FALSE)
  # remove comments
  udhr.XML <- gsub("[[:space:]]*<!--(.*)-->", "", udhr.XML[grep("<udhr ", udhr.XML)])
  # filter out only the interesting parts
  udhr.XML <- gsub("([[:space:]]+<udhr )(.*)/>", "\\2", udhr.XML, perl=TRUE)
  # split vector into a list with all elements; then we have basically what xmlParse() and xmlSApply() returned
  udhr.XML <- gsub("([[:alnum:]]+)=('[^']+'|'')[[:space:]]+", "\\1=\\2#", udhr.XML, perl=TRUE)
  # as a safety measure, put iso639-3 in quotes
  udhr.XML <- gsub("#iso639-3=", "#\"iso639-3\"=", udhr.XML)
  udhr.XML.list <- strsplit(udhr.XML, split="#")
  udhr.list <- lapply(seq_along(udhr.XML.list), function(cur.entry){eval(parse(text=paste0("c(", paste(udhr.XML.list[[cur.entry]], collapse=", "), ")")))})

  names(udhr.list) <- seq_along(udhr.list)
  # correct for missing values and variables
  udhr.list.corr <- sapply(udhr.list, function(udhr.entry){
      # older index files split up l+v and n+nv, newer have combined f and n elements
      if("f" %in% names(udhr.entry)){
        udhr.entry[["file"]] <- file.path(udhr.path, paste0("udhr_", udhr.entry[["f"]], ".txt"))
        udhr.entry[["name"]] <- udhr.entry[["n"]]
      } else if("l" %in% names(udhr.entry)){
        if(!"v" %in% names(udhr.entry)){
          udhr.entry[["v"]] <- NA
          udhr.entry[["file"]] <- file.path(udhr.path, paste0("udhr_", udhr.entry[["l"]], ".txt"))
        } else {
          udhr.entry[["file"]] <- file.path(udhr.path, paste0("udhr_", udhr.entry[["l"]], "_", udhr.entry[["v"]], ".txt"))
        }
        if(!"nv" %in% names(udhr.entry)){
          udhr.entry[["nv"]] <- NA
          udhr.entry[["name"]] <- udhr.entry[["n"]]
        } else {
          udhr.entry[["name"]] <- paste0(udhr.entry[["n"]], " (",udhr.entry[["nv"]],")")
        }
      } else {
        stop(simpleError("Erm, sorry -- looks like the format if the index.xml file has changed, please file a bug report!"))
      }
      # now try to load the translated text from file and add it to the entry
      udhr.file <- udhr.entry[["file"]]
      if(file.exists(udhr.file)){
        # remove the trailing notice also, because it will keep gzip from
        # optimizing for the actual charset
        udhr.entry[["text"]] <- gsub("^.*udhr. --- ", "", paste(suppressWarnings(scan(udhr.file, what=character(), quiet=quiet)), collapse=" "))
      } else {
        udhr.entry[["text"]] <- NA
      }
      # reorder again, otherwise t() will result in strange outcomes...
      entry.names <- sort(names(udhr.entry))
      return(udhr.entry[entry.names])
    }
  )

  udhr.df.corr <- as.data.frame(t(udhr.list.corr), stringsAsFactors=FALSE)
  # remove entries without a text to compare
  results <- udhr.df.corr[!is.na(udhr.df.corr[,"text"]), ]

  return(results)
} ## end function read.udhr()


## function text.1st.letter()
# changes the first letter of a word to upper or lower case
# if case="change", the present case will be switched to the other
text.1st.letter <- function(word, case){

  results <- sapply(word, function(cur.word){
    word.vector <- unlist(strsplit(cur.word, split=""))

    if(identical(case, "upper")){
      word.vector[1] <- toupper(word.vector[1])
    } else{}
    if(identical(case, "lower")){
      word.vector[1] <- tolower(word.vector[1])
    } else{}
    if(identical(case, "change")){
      if(isTRUE(grepl("([[:lower:]])", word.vector[1]))){
        word.vector[1] <- toupper(word.vector[1])
      } else if(isTRUE(grepl("([[:upper:]])", word.vector[1]))){
        word.vector[1] <- tolower(word.vector[1])
      } else {}
    } else{}

    word.new <- paste(word.vector, collapse="")
    return(word.new)
    }, USE.NAMES=FALSE)

  return(results)
} ## end function text.1st.letter()
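## illustration (not run):
#   koRpus:::text.1st.letter(c("word", "Word"), case="change")
#   # -> c("Word", "word")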


## function taggz()
# takes a vector of tokens and adds internal tags
# unicode escapes:
#   <e2><80><99> -> \u2019 right single quotation mark
#   <e2><80><93> -> \u2013 en dash
#' @importFrom data.table data.table :=
taggz <- function(tokens, abbrev=NULL, heur.fix=list(pre=c("\u2019","'"), suf=c("\u2019","'")), ign.comp="", sntc=c(".","!","?",";",":")){
  # make R CMD check happy...
  token <- tag <- NULL
  tagged.text <- data.table(token=tokens, tag="unk.kRp")
  Encoding(tagged.text$token) <- "UTF-8"
  Encoding(tagged.text$tag) <- "UTF-8"
  tagged.text[token == "" & tag == "unk.kRp", tag := "unk"]
  # all letters, assume it's a word
  tagged.text[!grepl(paste("[^\\p{L}\\p{M}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "word.kRp"]
  # all digits, assume it's a number
  tagged.text[!grepl(paste("[^\\p{N}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "no.kRp"]
  # assume it's an abbreviation
  tagged.text[token %in% abbrev & tag == "unk.kRp", tag := "abbr.kRp"]
  # assume it's a sentence ending
  tagged.text[token %in% sntc & tag == "unk.kRp", tag := ".kRp"]
  # all dots
  tagged.text[!grepl("[^.]", token, perl=TRUE) & tag == "unk.kRp", tag := "-kRp"]
  tagged.text[token == "," & tag == "unk.kRp", tag := ",kRp"]
  tagged.text[!grepl(paste("[^\\p{Ps}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "(kRp"]
#       } else if(tkn %in% c("(","{","[")){
#         return(c(token=tkn, tag="(kRp"))
  tagged.text[!grepl(paste("[^\\p{Pe}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := ")kRp"]
#       } else if(tkn %in% c(")","}","]")){
#         return(c(token=tkn, tag=")kRp"))
#       } else if(tkn %in% c("-","\u2013")){
#         return(c(token=tkn, tag="-kRp"))
  tagged.text[token %in% c("\"","'","''","`","``","\u2019","\u2019\u2019") & tag == "unk.kRp", tag := "''kRp"]
  tagged.text[!grepl(paste("[^\"\\p{Pi}\\p{Pf}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "''kRp"]
  tagged.text[!grepl(paste("[^\\p{Pd}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "-kRp"]
  # any other punctuation
  tagged.text[!grepl(paste("[^\\p{P}",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "-kRp"]
  # simple heuristics for abbreviations
  tagged.text[!grepl(paste("[^\\p{L}\\p{M}.",paste(ign.comp, collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "abbr.kRp"]
  # simple heuristics for pre- and suffixes
  tagged.text[!grepl(paste("[^\\p{L}\\p{M}\\p{N}",paste(unique(unlist(heur.fix), ign.comp), collapse=""),"]"), token, perl=TRUE) & tag == "unk.kRp", tag := "word.kRp"]
  # automatic headline or paragraph detection:
  tagged.text[token == "<kRp.h>" & tag == "unk.kRp", tag := "hon.kRp"]
  tagged.text[token == "</kRp.h>" & tag == "unk.kRp", tag := "hoff.kRp"]
  tagged.text[token == "<kRp.p/>" & tag == "unk.kRp", tag := "p.kRp"]

  tagged.text <- as.matrix(tagged.text)
  
  return(tagged.text)
} ## end function taggz()


## function tokenz()
# a simple tokenizer. txt must be a character vector with stuff to tokenize
# will return a vector with one token per element
# 'abbrev' must be a text file encoded in 'encoding', with one abbreviation per line
tokenz <- function(txt, split="[[:space:]]", ign.comp="-", heuristics="abbr", abbrev=NULL,
        encoding="unknown", heur.fix=list(pre=c("\u2019","'"), suf=c("\u2019","'")), tag=FALSE,
        sntc=c(".","!","?",";",":"), detect=c(parag=FALSE, hline=FALSE)){

  ## clean ign.comp for use in regexp
  ign.comp.rex <- ign.comp
  ign.comp.rex[ign.comp.rex %in% "-"] <- "\\-"
  ign.comp.rex <- paste(ign.comp.rex, collapse="")

  ## prep the text
  # if headlines and paragraphs should be autodetected
  if(any(detect)){
    hline <- ifelse("hline" %in% names(detect), detect[["hline"]], FALSE)
    parag <- ifelse("parag" %in% names(detect), detect[["parag"]], FALSE)
    lines.empty <- grepl("^([[:space:]]*)$", txt)
    if(isTRUE(hline)){
      # lines without any sentence ending punctuation are considered headlines
      headlines <-  !lines.empty & !grepl("[.;:!?]", txt) & !grepl(",$", txt)
      txt[headlines] <- paste("<kRp.h>", txt[headlines], "</kRp.h>", sep=" ")
    } else {}
    if(isTRUE(parag)){
      paragraphs <- lines.empty & c(FALSE, !lines.empty[-length(lines.empty)])
      if(isTRUE(hline)){
        # if the previous entry was a headline, don't tag a paragraph as well
        paragraphs <- paragraphs & c(FALSE, !headlines[-length(headlines)])
      } else {}
      txt[paragraphs] <- "<kRp.p/>"
    } else {}
  } else {}

  tk.pre.lst <- unlist(strsplit(txt, split))

  # check for abbreviations to consider
  if(!is.null(abbrev)){
    check.file(abbrev, mode="exist")
    abbrev.vect <- readLines(abbrev, encoding=encoding)
  } else {
    abbrev.vect <- NULL
  }

  # call our own simple tokenizer
  # it should split strings at every non-letter character, with the
  # exception of a defined word-binding (like "-")
  tokenized.text <- as.vector(unlist(sapply(tk.pre.lst, function(tkn){
      # in case of empty elements, skip to the next one
      if(identical(tkn,"")){
        return()
      } else {}
      # if automatic headline or paragraph detection was used, filter their tags
      if(tkn %in% c("<kRp.h>", "</kRp.h>", "<kRp.p/>")){
        return(tkn)
      } else {}
      # if it's all just letters and numbers, leave as is
      if(!grepl("[^\\p{L}\\p{M}\\p{N}]", tkn, perl=TRUE)){
        return(tkn)
      } else {
        # first, split off escaped quotes
        if(grepl("\"", tkn, perl=TRUE)){
          tkn <- gsub("(\")([^\\s])", "\\1 \\2", tkn, perl=TRUE)
          tkn <- gsub("([^\\s])(\")", "\\1 \\2", tkn, perl=TRUE)
        } else {}
        # create a vector for later exclusion if a pre/suffix heuristic was chosen and already applied
        tkn.nonheur <- tkn
        all.fixes <- c()
        # for pre- and suffixes, set number of occurring letters
        pre.max.num <- "" # the empty default reads a "exactly one letter"
        suf.max.num <- "{1,2}"
        # see what we need to check for
        check.suffix <- ifelse(any(c("en", "fr", "suf") %in% heuristics), TRUE, FALSE)
        check.prefix <- ifelse(any(c("fr", "pre") %in% heuristics), TRUE, FALSE)
        if("fr" %in% heuristics){
          # in french longer suffixes like "'elle" are common
          suf.max.num <- "+"
        } else {}
        # probably split off french prefixes like l'animal
        if(isTRUE(check.prefix)){
          stopifnot(length(heur.fix$pre) > 0)
          heur.prefix <- paste0(heur.fix$pre, collapse="")
          # also take care of cases where there's non-letter stuff before the prefix or after the prefixed word
          tkn <- gsub(paste0("(^[\\p{Z}\\p{S}\\p{P}]*)([\\p{L}\\p{M}\\p{N}]", pre.max.num, "[", heur.prefix, "])([\\p{L}\\p{M}\\p{N}]+)([\\p{Z}\\p{S}\\p{P}]*$)"),
            "\\1 \\2 \\3 \\4", tkn, perl=TRUE)
          all.fixes <- unique(c(all.fixes, heur.prefix))
          tkn.nonheur <- gsub(paste0("[",all.fixes,"]", collapse=""), "", tkn)
        } else {}
        # the same for possessive 's and the like
        if(isTRUE(check.suffix)){
          stopifnot(length(heur.fix$suf) > 0)
          heur.suffix <- paste0(heur.fix$suf, collapse="")
          # also take care of cases where there's non-letter stuff before the suffix or after the suffixed word
          tkn <- gsub(paste0("(^[\\p{Z}\\p{S}\\p{P}]*)([\\p{L}\\p{M}\\p{N}]+)([", heur.suffix, "][\\p{L}\\p{M}\\p{N}]", suf.max.num, ")([\\p{Z}\\p{S}\\p{P}]*$)"),
            "\\1 \\2 \\3 \\4", tkn, perl=TRUE)
          all.fixes <- unique(c(all.fixes, heur.suffix))
          tkn.nonheur <- gsub(paste0("[",all.fixes,"]", collapse=""), "", tkn)
        } else {}
        # check for possible abbreviations
        if(is.null(abbrev.vect) | !tkn %in% abbrev.vect){
          # do the heuristics
          # currently, single letters followed by a dot will be taken for an abbreviated name;
          # the former implementation demanded at least two appearances of that:
          #if("abbr" %in% heuristics & grepl("(\\p{L}\\p{M}*\\.){2,}", tkn, perl=TRUE)){
          if("abbr" %in% heuristics & grepl("(^\\p{P}*\\p{L}\\p{M}*\\.)", tkn, perl=TRUE)){
            # separate closing/opening brackets and dashes
            tkn <- gsub("([\\p{Pe}\\p{Ps}\\p{Pd}])([\\p{L}\\p{M}])", "\\1 \\2", tkn, perl=TRUE)
            tkn <- gsub("(\\.)([^\\p{L}\\p{M}])", "\\1 \\2", tkn, perl=TRUE)
          # take care of probably already applied prefix/suffix heuristics here:
          } else if(grepl("([\\p{L}\\p{M}\\p{N}]+)([^\\p{L}\\p{M}\\p{N}\\s]+)|([^\\p{L}\\p{M}\\p{N}\\s]+)([\\p{L}\\p{M}\\p{N}]+)", tkn.nonheur, perl=TRUE)){
            # this should be some other punctuation or strange stuff...
            tkn <- gsub(paste0("([^\\p{L}\\p{M}\\p{N}",ign.comp.rex,"])([\\p{L}\\p{M}\\p{N}])"), "\\1 \\2", tkn, perl=TRUE)
            tkn <- gsub(paste0("([\\p{L}\\p{M}\\p{N}])([^\\p{L}\\p{M}\\p{N}",ign.comp.rex,"])"), "\\1 \\2", tkn, perl=TRUE)
          } else {}
          # are there clusters of undefined nonword stuff left?
          if(grepl("([^\\p{L}\\p{M}\\p{N}\\s]{2,})", tkn, perl=TRUE)){
            # as long as it's not dots:
            tkn <- gsub(paste0("([^\\p{Zs}])([^\\p{L}\\p{M}\\p{N}.",ign.comp.rex,"\\p{Zs}])"), "\\1 \\2", tkn, perl=TRUE)
            tkn <- gsub(paste0("([^\\p{L}\\p{M}\\p{N}.",ign.comp.rex,"\\p{Zs}])([^\\p{Zs}])"), "\\1 \\2", tkn, perl=TRUE)
            # keep "..." intact
            tkn <- gsub("([^\\p{L}\\p{M}\\p{N}.\\s])([.])", "\\1 \\2", tkn, perl=TRUE)
          } else {}
        } else {}
        new.tkn <- unlist(strsplit(tkn, " "))
        # remove empty elements
        new.tkn <- new.tkn[!new.tkn %in% ""]
        return(new.tkn)
      }
    }, USE.NAMES=FALSE)))

  if(isTRUE(tag)){
    tokenized.text <- taggz(tokenized.text, abbrev=abbrev.vect, heur.fix=heur.fix, sntc=sntc, ign.comp=ign.comp)
  } else {}

  return(tokenized.text)
} ## end function tokenz()
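## usage sketch (not run): the internal tokenizer applied to a raw string, first
## returning plain tokens, then the pre-tagged matrix produced via taggz():
#   koRpus:::tokenz("A short sentence, isn't it?", heuristics="en")
#   koRpus:::tokenz("A short sentence, isn't it?", heuristics="en", tag=TRUE)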


## function queryList()
queryList <- function(obj, var, query, rel, as.df, ignore.case, perl, regexp_var){
  this.query <- query[[1]]
  this.query.vars <- names(this.query)
  this.q.var <- this.query.vars[[1]]
  this.q.query <- this.query[[1]]
  this.q.rel <- ifelse("rel" %in% this.query.vars, this.query[["rel"]], rel)
  this.q.ignore.case <- ifelse("ignore.case" %in% this.query.vars, this.query[["ignore.case"]], ignore.case)
  this.q.perl <- ifelse("perl" %in% this.query.vars, this.query[["perl"]], perl)
  if(length(query) == 1){
    obj <- query(obj=obj, var=this.q.var, query=this.q.query, rel=this.q.rel, as.df=as.df, ignore.case=this.q.ignore.case, perl=this.q.perl, regexp_var=regexp_var)
  } else {
    remaining.queries <- query[-1]
    remaining.obj <- query(obj=obj, var=this.q.var, query=this.q.query, rel=this.q.rel, as.df=FALSE, ignore.case=this.q.ignore.case, perl=this.q.perl, regexp_var=regexp_var)
    obj <- query(obj=remaining.obj, var=var, query=remaining.queries, rel=rel, as.df=as.df, ignore.case=ignore.case, perl=perl, regexp_var=regexp_var)
  }
  return(obj)
} ## end function queryList()


## function headLine()
# takes text and builds a border around it
headLine <- function(txt, level=1){
  if(identical(level, 1)){
    headlineTxt <- paste0("## ", txt, " ##")
    headlineLine <- paste(rep("#", nchar(headlineTxt, type="width")), collapse="")
    headlineFull <- paste0(headlineLine, "\n", headlineTxt, "\n", headlineLine)
  } else if(identical(level, 2)){
    headlineLine <- paste(rep("=", nchar(txt, type="width")), collapse="")
    headlineFull <- paste0(txt, "\n", headlineLine)
  } else {
    headlineLine <- paste(rep("-", nchar(txt, type="width")), collapse="")
    headlineFull <- paste0(txt, "\n", headlineLine)
  }
  return(headlineFull)
} ## end function headLine()
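## illustration (not run); level=1 additionally wraps the text in a full "#" border:
#   cat(koRpus:::headLine("results", level=2), "\n")
#   # results
#   # =======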


## function matching.lang()
# helper function to match language definitions,
# called by treetag()
matching.lang <- function(lang, lang.preset){
  if(!identical(lang, lang.preset)){
    warning(
      "Language \"",lang,"\" doesn't match the preset \"", lang.preset,"\". If you run into errors, you have been warned!",
      call.=FALSE
    )
  } else {}
}
## end function matching.lang()


## function paste.tokenized.text()
paste.tokenized.text <- function(txt){
  # put all text together
  all.text <- paste(txt, collapse=" ")
  # remove superfluous spaces
  all.text <- gsub("([[:space:]]{1})([\\(\\[\\{])([[:space:]]{1}|$)", " \\2", all.text, perl=TRUE)
  all.text <- gsub("([[:space:]]{1})([\\)\\]\\}])([[:space:]]{1}|$)", "\\2 ", all.text, perl=TRUE)
  all.text <- gsub("([[:space:]]{1})([,;.:])([[:space:]]{1}|$)", "\\2 ", all.text, perl=TRUE)
  return(all.text)
}
## end function paste.tokenized.text()
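## dev note -- a minimal sketch of the clean-up (run manually):
#   paste.tokenized.text(c("this", ",", "is", "a", "test", "(", "really", ")", "."))
#   # -> "this, is a test (really). " (note the trailing space left by the last substitution)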


## function checkLangPreset()
# checks if a given language preset is defined at all, and either returns TRUE/error or the full preset definition
checkLangPreset <- function(preset, returnPresetDefinition=TRUE){
  # koRpus dropped support for non-UTF-8 presets and renamed former presets omitting the "-utf8" suffix
  # to not break compatibility, we'll just gracefully remove the suffix
  if(grepl("utf8", preset)){
    preset <- gsub("-utf8$", "", preset)
    warning(paste0("UTF-8 is now the default encoding, please rename your preset from \"", preset, "-utf8\" into just \"", preset, "\"!"), call.=FALSE)
  } else {}
  preset_definition <- as.list(as.environment(.koRpus.env))[["langSup"]][["treetag"]][["presets"]][[preset]]
  if(is.null(preset_definition)){
    error_prefix <- "Manual TreeTagger configuration:\n  "
    # is a language support package installed but not loaded?
    # this check could be limited to two letter patterns,
    # but let's keep it open for now and see how it works out for the users
    lang_pckg_name <- paste0("koRpus.lang.", preset)
    preset_status <- check_lang_packages(available=TRUE, pattern=paste0("^", lang_pckg_name))
    if(lang_pckg_name %in% names(preset_status)){
      lang_pckg <- preset_status[[lang_pckg_name]]
      if(isTRUE(lang_pckg[["loaded"]])){
        stop(simpleError(paste0(error_prefix, "There appears to be a loaded language package \"", lang_pckg_name, "\",\n  yet the preset \"", preset, "\" is not available. Please check your installation!")))
      } else if(isTRUE(lang_pckg[["installed"]])){
        stop(simpleError(paste0(error_prefix, "The preset \"", preset, "\" is currently not available.\n  Did you forget to load the respective language package?\n  Try library(\"", lang_pckg_name, "\") instead of library(\"koRpus\").")))
      } else if(isTRUE(lang_pckg[["available"]])){
        stop(simpleError(paste0(error_prefix, "The preset \"", preset, "\" is currently not available.\n  Did you forget to install and load the respective language package?\n  Try install.koRpus.lang(\"", preset, "\") and then library(\"", lang_pckg_name, "\") instead of library(\"koRpus\").")))
      } else {
        stop(simpleError(paste0(error_prefix, "\"", preset, "\" is not a valid preset!")))
      }
    } else {
      stop(simpleError(paste0(error_prefix, "\"", preset, "\" is not a valid preset!")))
    }
  } else {
    if(isTRUE(returnPresetDefinition)){
      return(preset_definition)
    } else {
      return(TRUE)
    }
  }
}
## end function checkLangPreset()
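## dev note -- typical outcomes (sketch, not run):
#   checkLangPreset("de")          # full preset definition, provided koRpus.lang.de is loaded
#   checkLangPreset("de", returnPresetDefinition=FALSE)   # TRUE in the same situation
#   checkLangPreset("de-utf8")     # warns about the obsolete "-utf8" suffix, then proceeds with "de"
#   checkLangPreset("xy")          # error: not a valid preset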


## function checkTTOptions()
# this helper function does some basic validity checks on provided TT.options
# if all goes well, returns a named list with valid settings
# if manual.config=FALSE returns an empty list because options are omitted anyway
checkTTOptions <- function(TT.options, manual.config, TT.tknz=TRUE){
  result <- list()
  if(!is.null(TT.options) & !is.list(TT.options)){
    warning("You provided \"TT.options\", but not as a list!")
  } else {}
  optNames <- names(TT.options)

  if(isTRUE(manual.config)){
    # basic check for valid element names
    validOptions <- c(
      "path",
      "preset",
      "tokenizer",
      "tknz.opts",
      "pre.tagger",
      "tagger",
      "abbrev",
      "params",
      "lexicon",
      "lookup",
      "filter",
      "no.unknown",
      "splitter",
      "splitter.opts"
    )
    undefined.options <- !optNames %in% validOptions
    if(any(undefined.options)){
      stop(simpleError(paste0(
        "You used undefined names in TT.options:\n  \"",
        paste0(optNames[undefined.options], collapse="\", \""),
        "\""
      )))
    } else {}

    if(!"path" %in% optNames){
      stop(simpleError("Manual TreeTagger configuration demanded, but not even a path was defined!"))
    } else {
      # specify basic paths
      result[["TT.path"]] <- TT.options[["path"]]
      result[["TT.bin"]] <- file.path(result[["TT.path"]],"bin")
      result[["TT.cmd"]] <- file.path(result[["TT.path"]],"cmd")
      result[["TT.lib"]] <- file.path(result[["TT.path"]],"lib")
      # check if this is really a TreeTagger root directory
      sapply(c(result[["TT.bin"]], result[["TT.cmd"]], result[["TT.lib"]]), function(chk.dir){check.file(chk.dir, mode="dir")})
    }

    # basic options, cannot be toyed with
    result[["TT.opts"]] <- "-token -lemma -sgml -pt-with-lemma -quiet"
    # allow some dedicated options to be set without jeopardizing the output format
    if(!is.null(TT.options[["no.unknown"]])){
      result[["TT.opts"]] <- ifelse(
        isTRUE(TT.options[["no.unknown"]]),
        paste0(result[["TT.opts"]], " -no-unknown"),
        result[["TT.opts"]]
      )
    } else {}

    if(!is.null(TT.options[["preset"]])){
      result[["preset"]] <- checkLangPreset(preset=TT.options[["preset"]])
      if(isTRUE(grepl("tree-tagger-[[:alpha:].]+$|tag-[[:alpha:].]+$|*.bat$", tolower(TT.options[["tagger"]])))){
        # sometimes users try to combine TreeTagger's batch files with presets, which is doomed to fail
        stop(simpleError("If you're using a language preset, you must not set TreeTagger's batch files as 'tagger'! "))
      } else {}
    } else {
      # if no preset was defined, we need some more information
      if(isTRUE(TT.tknz)){
        needed.options <- c("tokenizer", "tagger", "params")
      } else {
        needed.options <- c("tagger", "params")
      }
      missing.options <- !needed.options %in% optNames
      if(any(missing.options)){
        stop(simpleError(paste0(
          "Manual TreeTagger configuration demanded, but not enough optinons given!\n  Missing options: \"",
          paste0(needed.options[missing.options], collapse="\", \""),
          "\""
        )))
      } else {}
    }
  } else {}

  return(result)
} ## end function checkTTOptions()
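## dev note -- a minimal manual configuration (sketch, not run; the path below is a
## hypothetical TreeTagger root directory):
#   checkTTOptions(
#     TT.options=list(path="~/treetagger", preset="de"),
#     manual.config=TRUE
#   )
## on success the returned list provides "TT.path", "TT.bin", "TT.cmd", "TT.lib",
## "TT.opts" and the full "preset" definition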


## function winPath()
# all of a sudden, the construction of file paths stopped working for some windows users,
# so we're forced to do something really, really ugly and replace all R-like "/"
# file separators with the windows-like "\\" manually.
winPath <- function(path){
  return(gsub("/", "\\\\", path))
}
# just for the record: i really *hate* windows!
## end function winPath()
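## dev note -- example (run manually):
#   winPath("C:/TreeTagger/bin/tree-tagger")
#   # -> "C:\\TreeTagger\\bin\\tree-tagger" (i.e., single backslashes once cat()ed)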


## function check_lang_packages()
# checks what koRpus.lang.* packages are currently installed or loaded
# returns a named list with a list for each installed package, providing
# entries named "available", "installed", "loaded", and "title"
# available: also check for all available packages in 'repos'
# available.only: omit all installed packages which cannot be found in 'repos'
#' @importFrom utils available.packages packageDescription
check_lang_packages <- function(
  available=FALSE,
  repos="https://undocumeantit.github.io/repos/l10n/",
  available.only=FALSE,
  pattern="^koRpus.lang.*"
){
  ### this function should be kept close to identical to the respective function
  ### in the 'sylly' package, except for the pattern
  result <- list()
  if(isTRUE(available)){
    available_packages <- utils::available.packages(repos=repos)
    available_koRpus_lang <- grepl(pattern, available_packages[,"Package"])
    supported_lang <- unique(available_packages[available_koRpus_lang,"Package"])
  } else {
    available_koRpus_lang <- FALSE
    supported_lang <- NULL
  }

  loaded_packages <- loadedNamespaces()
  loaded_koRpus_lang <- grepl(pattern, loaded_packages)
  installed_packages <- unique(dir(.libPaths()))
  installed_koRpus_lang <- grepl(pattern, installed_packages)

  have_koRpus_lang <- any(installed_koRpus_lang, available_koRpus_lang)

  if(isTRUE(have_koRpus_lang)){
    if(isTRUE(available.only)){
      all_packages <- supported_lang
    } else {
      all_packages <- unique(c(installed_packages[installed_koRpus_lang], supported_lang))
    }
    for (this_package in all_packages){
      result[[this_package]] <- list(available=NA, installed=FALSE, loaded=FALSE, title="(unknown)")
      if(all(isTRUE(available), this_package %in% supported_lang)){
        result[[this_package]][["available"]] <- TRUE
      } else {}
      if(this_package %in% unique(installed_packages[installed_koRpus_lang])){
        result[[this_package]][["installed"]] <- TRUE
        this_package_index <- which.min(!installed_packages %in% this_package)
        result[[this_package]][["title"]] <- utils::packageDescription(installed_packages[this_package_index])[["Title"]]
      } else {}
      if(this_package %in% unique(loaded_packages[loaded_koRpus_lang])){
        result[[this_package]][["loaded"]] <- TRUE
      } else {}
    }
  } else {}
  
  return(result)
} ## end function check_lang_packages()
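## dev note -- the return value is a named list with one entry per package found,
## each itself a list with "available", "installed", "loaded" and "title"; sketch
## (not run, results depend on the local library and the repository):
#   res <- check_lang_packages(available=TRUE)
#   names(res)                              # e.g. "koRpus.lang.de", "koRpus.lang.en", ...
#   res[["koRpus.lang.de"]][["installed"]]  # TRUE if the German support package is installed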


## function check_toggle_utf8()
# used in treetag() to work around the inconsistent naming of parameter
# and other files across TreeTagger releases; some have "utf8-" or "-utf8" in their name,
# both earlier and later versions don't. some use the ".txt" extension, later versions
# don't. so we'll first try with these pre- and suffixes, then without, and use
# whatever is found first. returns either a validated path or throws an error.
#
# - file_utf8: file to check for existence, the variant including "utf8" in its name
# - dir: full path to expected file (directory only)
# - optional: if TRUE, no error is thrown when no file was found and the path is returned as given;
#     further checks are done elsewhere, we're not handling missing files here, just
#     possible alternatives to defaults!
check_toggle_utf8 <- function(file_utf8, dir=NA, optional=FALSE){
  if(any(is.null(file_utf8), is.na(file_utf8))){
    return(file_utf8)
  } else {
    if(is.na(dir)){
      dir <- dirname(normalizePath(file_utf8, mustWork=FALSE))
      file_utf8 <- basename(file_utf8)
    } else {
      dir <- normalizePath(dir, mustWork=FALSE)
    }

    check.file(filename=dir, mode="dir", stopOnFail=TRUE)

    if(isTRUE(grepl("utf8", file_utf8))){
      all_possible_files <- c(file_utf8, gsub("-utf8", "", gsub("^utf8-", "", file_utf8)))
    } else {
      all_possible_files <- file_utf8
    }
    if(isTRUE(grepl("\\.txt$", file_utf8))){
      all_possible_files <- c(all_possible_files, gsub("\\.txt$", "", all_possible_files))
    } else {}

    all_possible_files <- file.path(dir, all_possible_files)
    all_files_found <- sapply(
      all_possible_files,
      check.file,
      mode="exist",
      stopOnFail=FALSE
    )

    if(any(all_files_found)){
      return(all_possible_files[all_files_found][1])
    } else {
      if(isTRUE(optional)){
        return(file_utf8)
      } else {
        stop(simpleError(
          paste0(
            "None of the following files were found, please check your TreeTagger installation!\n ",
            paste0(all_possible_files, collapse="\n ")
          )
        ))
      }
    }
  }
} ## end function check_toggle_utf8()
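## dev note -- candidate resolution order (sketch, not run; the directory below is a
## hypothetical TreeTagger 'lib' directory):
#   check_toggle_utf8("german-utf8.par", dir="/opt/treetagger/lib")
#   # tries "/opt/treetagger/lib/german-utf8.par", then ".../german.par",
#   # and returns the first path that exists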


## function check_doc_id()
check_doc_id <- function(
  doc_id,
  default,
  quiet=TRUE
){
  warn <- FALSE
  warn_class <- "unknown"
  if(missing(doc_id)){
    warn <- TRUE
    warn_class <- "missing"
    doc_id <- NA
  } else {}
  if(!is.character(doc_id)){
    if(!isTRUE(warn)){
      warn <- TRUE
      warn_class <- class(doc_id)
    } else {}
    if(any(is.null(doc_id), is.logical(doc_id))){
      if(missing(default)){
        stop(simpleError("No \"doc_id\" given and no default value found!"))
      } else {}
      doc_id <- gsub("[^[:alnum:]_\\\\.-]+", "", default)
    } else {
      doc_id <- as.character(doc_id)
    }
  } else {}
  if(all(!isTRUE(quiet), warn)){
    warning(paste0("Fixed \"doc_id\", was ", warn_class, "!"), call.=FALSE)
  } else {}
  return(doc_id)
} ## end function check_doc_id()
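## dev note -- coercion behaviour (sketch, not run):
#   check_doc_id(doc_id=123, default="fallback")   # -> "123" (coerced to character)
#   check_doc_id(default="my text.txt")            # doc_id missing -> "mytext.txt" (cleaned default)
#   check_doc_id(doc_id=NA)                        # error: no default to fall back on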