# R/02_method_treetag.R

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.


#' A method to call TreeTagger
#'
#' This method calls a local installation of TreeTagger[1] to tokenize and POS tag the given text.
#'
#' Note that the value of \code{lang} must match a valid language supported by \code{\link[koRpus:kRp.POS.tags]{kRp.POS.tags}}.
#' It will also get stored in the resulting object and might be used by other functions at a later point.
#' E.g., \code{treetag} is being called by \code{\link[koRpus:freq.analysis]{freq.analysis}}, which
#' will by default query this language definition, unless explicitly told otherwise. The rationale behind this
#' is to comfortably make it possible to have tokenized and POS tagged objects of various languages around
#' in your workspace, and not worry about that too much.
#'
#' @param file Either a connection or a character vector, valid path to a file, containing the text to be analyzed.
#'    If \code{file} is a connection, its contents will be written to a temporary file, since TreeTagger can't read from
#'    R connection objects.
#' @param treetagger A character vector giving the TreeTagger script to be called. If set to \code{"kRp.env"} this is got from \code{\link[koRpus:get.kRp.env]{get.kRp.env}}.
#'    Only if set to \code{"manual"}, it is assumed not to be a wrapper script that can work the given text file, but that you would like
#'    to manually tweak options for tokenizing and POS tagging yourself. In that case, you need to provide a full set of options with the \code{TT.options}
#'    parameter.
#' @param rm.sgml Logical, whether SGML tags should be ignored and removed from output
#' @param lang A character string naming the language of the analyzed corpus. See \code{\link[koRpus:kRp.POS.tags]{kRp.POS.tags}} and
#'    \code{\link[koRpus:available.koRpus.lang]{available.koRpus.lang}} for all supported languages. If set to \code{"kRp.env"} this is
#'    fetched from \code{\link[koRpus:get.kRp.env]{get.kRp.env}}.
#' @param apply.sentc.end Logical, whether the tokens defined in \code{sentc.end} should be searched and set to a sentence ending tag.
#' @param sentc.end A character vector with tokens indicating a sentence ending. This adds to TreeTaggers results, it doesn't really replace them.
#' @param encoding A character string defining the character encoding of the input file, like  \code{"Latin1"} or \code{"UTF-8"}. If \code{NULL},
#'    the encoding will either be taken from a preset (if defined in \code{TT.options}), or fall back to \code{""}. Hence you can overwrite the preset encoding with this parameter.
#' @param TT.options A list of options to configure how TreeTagger is called. You have two basic choices: Either you choose one of the pre-defined presets
#'    or you give a full set of valid options:
#'    \itemize{
#'      \item {\code{path}} {Mandatory: The absolute path to the TreeTagger root directory. That is where its subfolders \code{bin}, \code{cmd} and \code{lib} are located.}
#'      \item {\code{preset}} {Optional: If you choose one of the pre-defined presets of one of the available language packages (like \code{"de"} for German, see
#'        \code{\link[koRpus:available.koRpus.lang]{available.koRpus.lang}} for details),
#'        you can omit all the following elements, because they will be filled with defaults. Of course this only makes sense if you have a
#'        working default installation. Note that since koRpus 0.07-1, UTF-8 is the global default encoding.}
#'      \item {\code{tokenizer}} {Mandatory: A character string, naming the tokenizer to be called. Interpreted relative to \code{path/cmd/}.}
#'      \item {\code{tknz.opts}} {Optional: A character string with the options to hand over to the tokenizer. You don't need to specify "-a"
#'        if \code{abbrev} is given. If \code{TT.tknz=FALSE}, you can pass configurational options to \code{\link[koRpus:tokenize]{tokenize}}
#'        by providing them as a named list (instead of a character string) here.}
#'      \item {\code{pre.tagger}} {Optional: A character string with code to be run before the tagger. This code is used as-is, so you need
#'        to make sure it includes the needed pipe symbols.}
#'      \item {\code{tagger}} {Mandatory: A character string, naming the tagger-command to be called. Interpreted relative to \code{path/bin/}.}
#'      \item {\code{abbrev}} {Optional: A character string, naming the abbreviation list to be used. Interpreted relative to \code{path/lib/}.}
#'      \item {\code{params}} {Mandatory: A character string, naming the parameter file to be used. Interpreted relative to \code{path/lib/}.}
#'      \item {\code{lexicon}} {Optional: A character string, naming the lexicon file to be used. Interpreted relative to \code{path/lib/}.}
#'      \item {\code{lookup}} {Optional: A character string, naming the lexicon lookup command. Interpreted relative to \code{path/cmd/}.}
#'      \item {\code{filter}} {Optional: A character string, naming the output filter to be used. Interpreted relative to \code{path/cmd/}.}
#'      \item {\code{no.unknown}} {Optional: Logical, can be used to toggle the \code{"-no-unknown"} option of TreeTagger (defaults to \code{FALSE}).}
#'      \item {\code{splitter}} {Optional: A character string, naming the splitter to be called (before the tokenizer). Interpreted relative to \code{path/cmd/}.}
#'      \item {\code{splitter.opts}} {Optional: A character string with the options to hand over to the splitter.}
#'    }
#' You can also set these options globally using \code{\link[koRpus:set.kRp.env]{set.kRp.env}},
#' and then force \code{treetag} to use them by setting \code{TT.options="kRp.env"} here. Note: 
#' If you use the \code{treetagger} setting from kRp.env and it's set to \code{TT.cmd="manual"},
#' \code{treetag} will treat \code{TT.options=NULL} like \code{TT.options="kRp.env"} 
#' automatically.
#' @param debug Logical. Especially in cases where the presets wouldn't work as expected, this switch can be used to examine the values \code{treetag}
#'    is assuming.
#' @param TT.tknz Logical, if \code{FALSE} TreeTagger's tokenzier script will be replaced by \code{koRpus}' function \code{\link[koRpus:tokenize]{tokenize}}.
#'    To accomplish this, its results will be written to a temporary file which is automatically deleted afterwards (if \code{debug=FALSE}). Note that
#'    this option only has an effect if \code{treetagger="manual"}.
#' @param format Either "file" or "obj", depending on whether you want to scan files or analyze the text in a given object, like
#'    a character vector. If the latter, it will be written to a temporary file (see \code{file}).
#' @param stopwords A character vector to be used for stopword detection. Comparison is done in lower case. You can also simply set 
#'    \code{stopwords=tm::stopwords("en")} to use the english stopwords provided by the \code{tm} package.
#' @param stemmer A function or method to perform stemming. For instance, you can set \code{SnowballC::wordStem} if you have
#'    the \code{SnowballC} package installed. As of now, you cannot provide further arguments to this function.
#' @param doc_id Character string, optional identifier of the particular document. Will be added to the \code{desc} slot, and as a factor to the \code{"doc_id"} column
#'    of the \code{tokens} slot. If \code{NA}, the document name will be used (for \code{format="obj"} a random name).
#' @param add.desc Logical. If \code{TRUE}, the tag description (column \code{"desc"} of the data.frame) will be added directly
#'    to the resulting object. If set to \code{"kRp.env"} this is fetched from \code{\link[koRpus:get.kRp.env]{get.kRp.env}}.
#' @param ... Only used for the method generic.
#' @return An object of class \code{\link[koRpus:kRp.text-class]{kRp.text}}. If \code{debug=TRUE}, prints internal variable settings and attempts to return the
#'    original output of the TreeTagger system call as a matrix.
#' @author m.eik michalke \email{meik.michalke@@hhu.de}, support for various languages was contributed by Earl Brown (Spanish), Alberto Mirisola (Italian) and
#'    Alexandre Brulet (French).
#' @keywords misc
#' @seealso \code{\link[koRpus:freq.analysis]{freq.analysis}}, \code{\link[koRpus:get.kRp.env]{get.kRp.env}},
#' \code{\link[koRpus:kRp.text-class]{kRp.text}}
#' @references
#' Schmid, H. (1994). Probabilistic part-of-speech tagging using decision trees. In
#'    \emph{International Conference on New Methods in Language Processing}, Manchester, UK, 44--49.
#'
#' [1] \url{https://www.cis.lmu.de/~schmid/tools/TreeTagger/}
#' @export
#' @import methods
#' @docType methods
#' @rdname treetag-methods
#' @example inst/examples/define_sample_file.R
#' @examples
#' \dontrun{
#' # first way to invoke POS tagging, using a built-in preset:
#' tagged.results <- treetag(
#'   sample_file,
#'   treetagger="manual",
#'   lang="en",
#'   TT.options=list(
#'     path=file.path("~","bin","treetagger"),
#'     preset="en"
#'   )
#' )
#' # second way, use one of the batch scripts that come with TreeTagger:
#' tagged.results <- treetag(
#'   sample_file,
#'   treetagger=file.path("~","bin","treetagger","cmd","tree-tagger-english"),
#'   lang="en"
#' )
#' # third option, set the above batch script in an environment object first:
#' set.kRp.env(
#'   TT.cmd=file.path("~","bin","treetagger","cmd","tree-tagger-english"),
#'   lang="en"
#' )
#' tagged.results <- treetag(
#'   sample_file
#' )
#'
#' # after tagging, use the resulting object with other functions in this package:
#' readability(tagged.results)
#' lex.div(tagged.results)
#' 
#' ## enabling stopword detection and stemming
#' # if you also installed the packages tm and SnowballC,
#' # you can use some of their features with koRpus:
#' set.kRp.env(
#'   TT.cmd="manual",
#'   lang="en",
#'   TT.options=list(
#'     path=file.path("~","bin","treetagger"),
#'     preset="en"
#'   )
#' )
#' tagged.results <- treetag(
#'   sample_file,
#'   stopwords=tm::stopwords("en"),
#'   stemmer=SnowballC::wordStem
#' )
#'
#' # removing all stopwords now is simple:
#' tagged.noStopWords <- filterByClass(
#'   tagged.results,
#'   "stopword"
#' )
#' }
setGeneric(
  "treetag",
  valueClass=c("kRp.text", "matrix"),
  def=function(
    file, treetagger="kRp.env", rm.sgml=TRUE, lang="kRp.env",
    apply.sentc.end=TRUE, sentc.end=c(".", "!", "?", ";", ":"),
    encoding=NULL, TT.options=NULL, debug=FALSE, TT.tknz=TRUE,
    format="file", stopwords=NULL, stemmer=NULL, doc_id=NA,
    add.desc="kRp.env", ...
  ){
    # dispatch only -- all actual work happens in the methods defined below
    standardGeneric("treetag")
  }
)

#' @export
#' @docType methods
#' @rdname treetag-methods
#' @aliases treetag,character-method
setMethod("treetag",
  signature(file="character"),
  function(
    file,
    treetagger="kRp.env",
    rm.sgml=TRUE,
    lang="kRp.env",
    apply.sentc.end=TRUE,
    sentc.end=c(".","!","?",";",":"),
    encoding=NULL,
    TT.options=NULL,
    debug=FALSE,
    TT.tknz=TRUE,
    format="file",
    stopwords=NULL,
    stemmer=NULL,
    doc_id=NA,
    add.desc="kRp.env"
  ){

    # TreeTagger uses slightly different presets on windows and unix machines,
    # so we'll need to check the OS first
    unix.OS <- !identical(base::.Platform[["OS.type"]], "windows")

    # check on TT options
    if(identical(treetagger, "kRp.env")){
      treetagger <- get.kRp.env(TT.cmd=TRUE)
      if(all(is.null(TT.options), identical(treetagger, "manual"))){
        # TT.cmd="manual" from the environment implies TT.options="kRp.env"
        TT.options <- get.kRp.env(TT.options=TRUE)
      } else {}
    } else {}
    if(identical(TT.options, "kRp.env")){
      TT.options <- get.kRp.env(TT.options=TRUE)
    } else if(!is.null(TT.options) && !is.list(TT.options)){
      warning("You provided \"TT.options\", but not as a list!")
    } else {}

    if(identical(lang, "kRp.env")){
      lang <- get.kRp.env(lang=TRUE)
    } else {}
    if(identical(add.desc, "kRp.env")){
      add.desc <- get.kRp.env(add.desc=TRUE)
    } else {}

    if(identical(treetagger, "tokenize")){
      stop(simpleError("Sorry, you can't use treetag() and tokenize() at the same time!"))
    } else {}

    # TreeTagger won't be able to use a connection object, so to make these usable,
    # we have to write its content to a temporary file first
    if(identical(format, "obj")){
      takeAsFile <- dumpTextToTempfile(text=file, encoding=encoding)
      if(!isTRUE(debug)){
        on.exit(unlink(takeAsFile), add=TRUE)
      } else {}
    } else {
      # does the text file exist?
      takeAsFile <- normalizePath(file)
    }
    check.file(takeAsFile, mode="exist")

    doc_id <- check_doc_id(doc_id=doc_id, default=basename(takeAsFile))

    # TODO: move TT.options checks to internal function to call it here
    manual.config <- identical(treetagger, "manual")
    checkedOptions <- checkTTOptions(TT.options=TT.options, manual.config=manual.config, TT.tknz=TT.tknz)

    if(isTRUE(manual.config)){
      # specify basic paths
      TT.path <- checkedOptions[["TT.path"]]
      TT.bin <- checkedOptions[["TT.bin"]]
      TT.cmd <- checkedOptions[["TT.cmd"]]
      TT.lib <- checkedOptions[["TT.lib"]]

      # basic options
      TT.opts <- checkedOptions[["TT.opts"]]
      TT.extra.opts <- ""

      in.TT.options <- names(TT.options)

      have.preset <- use.splitter <- FALSE
      if(any(in.TT.options == "preset")){
        ## minimum requirements for new presets:
        #  TT.splitter        <- c() # done before tokenization
        #  TT.tokenizer       <- file.path(TT.cmd, "...")
        #  TT.tagger          <- file.path(TT.bin, "...")
        #  TT.abbrev          <- c()
        #  TT.params          <- file.path(TT.lib, "...")
        #  TT.tknz.opts       <- c()
        #  TT.lookup.command  <- c()
        #  TT.filter.command  <- c()
        preset.definition <- checkedOptions[["preset"]]
        # check for matching language definitions
        matching.lang(lang=lang, lang.preset=preset.definition[["lang"]])
        preset.list <- preset.definition[["preset"]](TT.cmd=TT.cmd, TT.bin=TT.bin, TT.lib=TT.lib, unix.OS=unix.OS)

        TT.splitter        <- preset.list[["TT.splitter"]]
        TT.splitter.opts   <- preset.list[["TT.splitter.opts"]]
        TT.tokenizer       <- preset.list[["TT.tokenizer"]]
        TT.pre.tagger      <- preset.list[["TT.pre.tagger"]]
        TT.tagger          <- preset.list[["TT.tagger"]]
        TT.abbrev          <- preset.list[["TT.abbrev"]]
        TT.params          <- preset.list[["TT.params"]]
        TT.lexicon         <- preset.list[["TT.lexicon"]]
        TT.lookup          <- preset.list[["TT.lookup"]]
        TT.filter          <- preset.list[["TT.filter"]]

        TT.tknz.opts       <- preset.list[["TT.tknz.opts"]]
        TT.lookup.command  <- preset.list[["TT.lookup.command"]]
        TT.filter.command  <- preset.list[["TT.filter.command"]]
        have.preset <- TRUE
      } else {}

      # individual TT.options always overrule preset values
      if(any(in.TT.options == "tokenizer")){
        TT.tokenizer    <- check_toggle_utf8(file_utf8=TT.options[["tokenizer"]], dir=TT.cmd)
      } else {
        TT.tokenizer    <- check_toggle_utf8(file_utf8=TT.tokenizer)
      }
      if(any(in.TT.options == "tagger")){
        TT.tagger      <- file.path(TT.bin, TT.options[["tagger"]])
      } else {}
      # check if path works
      check.file(TT.tagger, mode="exist")
      if(any(in.TT.options == "pre.tagger")){
        TT.pre.tagger  <- TT.options[["pre.tagger"]]
      } else {
        if(!isTRUE(have.preset)){
          TT.pre.tagger <- c()
        } else {}
      }
      if(any(in.TT.options == "params")){
        TT.params      <- check_toggle_utf8(file_utf8=TT.options[["params"]], dir=TT.lib)
      } else {
        TT.params      <- check_toggle_utf8(file_utf8=TT.params)
      }
      if(any(in.TT.options == "lexicon")){
        TT.lexicon     <- check_toggle_utf8(file_utf8=TT.options[["lexicon"]], dir=TT.lib, optional=TRUE)
      } else {
        TT.lexicon     <- check_toggle_utf8(file_utf8=TT.lexicon, optional=TRUE)
      }

      # check the input encoding: explicit "encoding" overrules the preset,
      # which in turn overrules the "" fallback
      input.enc <- ifelse(
        is.null(encoding),
          ifelse(
            all(identical(treetagger, "manual"), any(in.TT.options == "preset")),
              preset.definition[["encoding"]],
              ""),
          encoding)

      if(any(in.TT.options == "tknz.opts")){
        TT.tknz.opts    <- TT.options[["tknz.opts"]]
      } else {
        if(!isTRUE(have.preset)){
          TT.tknz.opts  <- c()
        } else {}
      }

      if(any(in.TT.options == "splitter")){
        TT.splitter     <- TT.options[["splitter"]]
      } else {
        if(!isTRUE(have.preset)){
          TT.splitter   <- c()
        } else {}
      }
      if(any(in.TT.options == "splitter.opts")){
        TT.splitter.opts <- TT.options[["splitter.opts"]]
      } else {
        if(!isTRUE(have.preset)){
          TT.splitter.opts <- c()
        } else {}
      }
      if(all(!identical(TT.splitter, ""), !identical(TT.splitter, c()), !is.null(TT.splitter))){
        # check if path works
        check.file(TT.splitter, mode="exist")
        use.splitter <- TRUE
      }

      if(any(in.TT.options == "abbrev")){
        TT.abbrev      <- check_toggle_utf8(file_utf8=TT.options[["abbrev"]], dir=TT.lib)
        TT.tknz.opts   <- paste(TT.tknz.opts, "-a", TT.abbrev)
      } else {
        if(all(isTRUE(have.preset), !identical(TT.abbrev, c()))){
          TT.abbrev    <- check_toggle_utf8(file_utf8=TT.abbrev)
          TT.tknz.opts <- paste(TT.tknz.opts, "-a", TT.abbrev)
        } else {
          # fall back to the default abbreviation setting of tokenize()
          TT.abbrev    <- eval(formals(tokenize)[["abbrev"]])
        }
      }

      ## probably replacing the tokenizer
      if(!isTRUE(TT.tknz)){
        if(!is.list(TT.tknz.opts)){
          TT.tknz.opts <- list()
        } else {}
        # fill any option not explicitly given with the defaults of tokenize()
        given.tknz.options <- names(TT.tknz.opts)
        tokenize.options <- c("split", "ign.comp", "heuristics", "heur.fix",
          "sentc.end", "detect", "clean.raw", "perl", "stopwords", "stemmer")
        for (this.opt in tokenize.options){
          if(!any(given.tknz.options == this.opt)) {
            TT.tknz.opts[[this.opt]] <- eval(formals(tokenize)[[this.opt]])
            if(isTRUE(debug)){
              message(paste0("        ", this.opt, "=", paste0(TT.tknz.opts[[this.opt]], collapse=", ")))
            } else {}
          } else {}
        }
        if(!any(given.tknz.options == "abbrev")){
          TT.tknz.opts[["abbrev"]] <- TT.abbrev
        } else {}

        # set this just for the debug printout
        TT.tokenizer  <- "koRpus::tokenize()"
        # call tokenize() and write results to tempfile
        tknz.tempfile <- tempfile(pattern="tokenize", fileext=".txt")
        tknz.results <- tokenize(
          takeAsFile,
          format="file",
          fileEncoding=input.enc,
          split=TT.tknz.opts[["split"]],
          ign.comp=TT.tknz.opts[["ign.comp"]],
          heuristics=TT.tknz.opts[["heuristics"]],
          heur.fix=TT.tknz.opts[["heur.fix"]],
          abbrev=TT.tknz.opts[["abbrev"]],
          tag=FALSE,
          lang=lang,
          sentc.end=TT.tknz.opts[["sentc.end"]],
          detect=TT.tknz.opts[["detect"]],
          clean.raw=TT.tknz.opts[["clean.raw"]],
          perl=TT.tknz.opts[["perl"]],
          stopwords=TT.tknz.opts[["stopwords"]],
          stemmer=TT.tknz.opts[["stemmer"]]
        )
        # TreeTagger can produce mixed encoded results if fed with UTF-8 in Latin1 mode
        tknz.results <- iconv(tknz.results, from="UTF-8", to=input.enc)
        # NOTE: "add=TRUE" is crucial here -- on.exit() without it would discard
        # all previously registered exit handlers, e.g. the unlink() of the
        # temporary input file registered above for format="obj"
        on.exit(message(paste0("Assuming '", input.enc, "' as encoding for the input file. If the results turn out to be erroneous, check the file for invalid characters, e.g. em.dashes or fancy quotes, and/or consider setting 'encoding' manually.")), add=TRUE)
        cat(paste(tknz.results, collapse="\n"), "\n", file=tknz.tempfile, sep="")
        if(!isTRUE(debug)){
          on.exit(unlink(tknz.tempfile), add=TRUE)
        } else {}
      } else {}

      if(any(in.TT.options == "lexicon")){
        if(all(!isTRUE(have.preset), !any(in.TT.options == "lookup"))){
          TT.lookup.command  <- c()
          warning("Manual TreeTagger configuration: Defined a \"lexicon\" without a \"lookup\" command, hence omitted!")
        } else {
          if(any(in.TT.options == "lookup")){
            TT.lookup    <- file.path(TT.cmd, TT.options[["lookup"]])
          } else {}
          TT.lexicon      <- file.path(TT.lib, TT.options[["lexicon"]])
          check.file(TT.lookup, mode="exist")
          lexiconExists <- check.file(TT.lexicon, mode="exist", stopOnFail=FALSE)
          if(isTRUE(lexiconExists)){
            if(any(identical(TT.lookup, ""), identical(TT.lookup, c()))){
              TT.extra.opts <- paste0(TT.extra.opts, " -lex ", TT.lexicon)
            } else {
              TT.lookup.command  <- paste(TT.lookup, TT.lexicon, "|")
            }
          } else {
            TT.lookup.command  <- c()
            warning(paste0("Can't find the lexicon file, hence omitted! Please ensure this path is valid:\n  ", TT.lexicon), call.=FALSE)
          }
        }
      } else {
        if(!isTRUE(have.preset)){
          TT.lookup.command  <- c()
        } else {
          if(!identical(TT.lookup.command, c())){
            check.file(TT.lookup, mode="exist")
            lexiconExists <- check.file(TT.lexicon, mode="exist", stopOnFail=FALSE)
            if(!isTRUE(lexiconExists)){
              TT.lookup.command  <- c()
              warning(paste0("Can't find the lexicon file, hence omitted! Please ensure this path is valid:\n  ", TT.lexicon), call.=FALSE)
            } else {}
          } else {}
        }
      }

      if(any(in.TT.options == "filter")){
        TT.filter      <- file.path(TT.cmd, TT.options[["filter"]])
        TT.filter.command  <- paste("|", TT.filter)
      } else {
        if(!isTRUE(have.preset)){
          TT.filter.command  <- c()
        } else {}
      }

      # create system call for unix and windows
      if(isTRUE(unix.OS)){
        if(isTRUE(TT.tknz)){
          TT.call.file <- paste0("\"", takeAsFile, "\"")
          if(isTRUE(use.splitter)){
            TT.splitter <- paste(TT.splitter, TT.call.file, TT.splitter.opts)
            TT.call.file <- ""
          } else {}
          sys.tt.call <- paste(TT.splitter, TT.tokenizer, TT.tknz.opts, TT.call.file, "|",
            TT.lookup.command, TT.pre.tagger, TT.tagger, TT.opts, TT.extra.opts, TT.params, TT.filter.command)
        } else {
          sys.tt.call <- paste("cat ", tknz.tempfile, "|",
            TT.lookup.command, TT.pre.tagger, TT.tagger, TT.opts, TT.extra.opts, TT.params, TT.filter.command)
        }
      } else {
        if(isTRUE(TT.tknz)){
          TT.call.file <- winPath(paste0("\"", takeAsFile, "\""))
          if(isTRUE(use.splitter)){
            TT.splitter <- paste(winPath(TT.splitter), TT.call.file, TT.splitter.opts)
            TT.call.file <- ""
          } else {}
          sys.tt.call <- paste(TT.splitter, "perl ", winPath(TT.tokenizer), TT.tknz.opts, TT.call.file, "|",
            winPath(TT.lookup.command), TT.pre.tagger, winPath(TT.tagger), winPath(TT.params), TT.opts, TT.extra.opts, TT.filter.command)
        } else {
          sys.tt.call <- paste("type ", winPath(tknz.tempfile), "|",
            winPath(TT.lookup.command), TT.pre.tagger, winPath(TT.tagger), winPath(TT.params), TT.opts, TT.extra.opts, TT.filter.command)
        }
      }

    } else {
      # treetagger points to a wrapper script, no manual configuration needed
      input.enc <- ifelse(
        is.null(encoding),
          "",
          encoding)

      check.file(treetagger, mode="exec")

      sys.tt.call <- paste(treetagger, takeAsFile)
    }

    ## uncomment for debugging
    if(isTRUE(debug)){
      if(isTRUE(manual.config)){
        message(paste(
          if(isTRUE(use.splitter)){
          paste("
          TT.splitter: ", TT.splitter,"
          TT.splitter.opts: ", TT.splitter.opts)
          },"
          TT.tokenizer: ",TT.tokenizer,
          ifelse(isTRUE(TT.tknz),
            paste0("\n          TT.tknz.opts: ",TT.tknz.opts),
            paste0("\n          tempfile: ",tknz.tempfile)),"
          file: ",takeAsFile,"
          TT.lookup.command: ",TT.lookup.command,"
          TT.pre.tagger: ", TT.pre.tagger,"
          TT.tagger: ",TT.tagger,"
          TT.opts: ",TT.opts,"
          TT.extra.opts: ", TT.extra.opts,"
          TT.params: ",TT.params,"
          TT.filter.command: ",TT.filter.command,"\n
          sys.tt.call: ",sys.tt.call,"\n"))
      } else {
        message(paste("
          file: ",takeAsFile,"
          sys.tt.call: ",sys.tt.call,"\n"))
      }
    } else {}

    ## do the system call
    if(isTRUE(unix.OS)){
      tagged.text <- system(sys.tt.call, ignore.stderr=TRUE, intern=TRUE)
    } else {
      tagged.text <- shell(sys.tt.call, translate=FALSE, ignore.stderr=TRUE, intern=TRUE)
    }
    # TreeTagger should return UTF-8; explicitly declare it so there's no doubt about it
    Encoding(tagged.text) <- "UTF-8"

    ## workaround
    # in seldom cases TreeTagger seems to return duplicate tab stops
    # we'll try to correct for that here
    if(any(grepl("\t\t", tagged.text))){
      tagged.text <- gsub("\t\t", "\t", tagged.text)
    } else {}

    if(isTRUE(rm.sgml)){
      tagged.text <- tagged.text[grep("^[^<]", tagged.text)]
    } else {}

    ## try to catch error in local TreeTagger setup
    # when TreeTagger is not set up correctly, the system call will not fail loudly
    # but simply not return any useful data. this in turn will definitely cause
    # treetag() to fail with an error. we'll make it obvious that probably
    # not koRpus is to blame for this -- but it could also be preset bugs!
    tagged.text <- unlist(strsplit(tagged.text, "\t"))
    if(is.null(tagged.text)){
      stop(simpleError(paste0(
        "Awww, this should not happen: TreeTagger didn't return any useful data.\n",
        "  This can happen if the local TreeTagger setup is incomplete or different from what presets expected.\n",
        "  You should re-run your command with the option 'debug=TRUE'. That will print all relevant configuration.\n",
        "  Look for a line starting with 'sys.tt.call:' and try to execute the full command following it in a\n",
        "  command line terminal. Do not close this R session in the meantime, as 'debug=TRUE' will keep temporary\n",
        "  files that might be needed.\n",
        "  If running the command after 'sys.tt.call:' does fail, you'll need to fix the TreeTagger setup.\n",
        "  If it does *not* fail but produce a table with proper results, please contact the author!"
      )))
    } else {
      # TreeTagger output is one "token<TAB>tag<TAB>lemma" line per token
      tagged.mtrx <- matrix(tagged.text, ncol=3, byrow=TRUE, dimnames=list(c(),c("token","tag","lemma")))
    }

    # add sentence endings as defined
    if(isTRUE(apply.sentc.end)){
      sntc.end.tag <- kRp.POS.tags(lang, tags="sentc", list.tags=TRUE)[[1]]
      matched.sentc.tokens <- tagged.mtrx[, "token"] %in% sentc.end
      tagged.mtrx[matched.sentc.tokens, "tag"] <- sntc.end.tag
    } else {}
    ## for debugging:
    if(isTRUE(debug)){
      return(tagged.mtrx)
    } else {}

    # add word classes, comments and numer of letters ("wclass", "desc", "lttr")
    tagged.mtrx <- treetag.com(tagged.mtrx, lang=lang, add.desc=add.desc)

    # probably apply stopword detection and stemming
    tagged.mtrx <- stopAndStem(tagged.mtrx, stopwords=stopwords, stemmer=stemmer, lowercase=TRUE)

    # add columns "idx", "sntc" and "doc_id"
    tagged.mtrx <- indexSentenceDoc(tagged.mtrx, lang=lang, doc_id=doc_id)

    results <- kRp_text(lang=lang, tokens=tagged.mtrx)
    ## descriptive statistics
    if(is.null(encoding)){
      encoding <- ""
    } else {}
    txt.vector <- readLines(takeAsFile, encoding=encoding, warn=FALSE)
    # force text into UTF-8 format
    txt.vector <- enc2utf8(txt.vector)
    Encoding(txt.vector) <- "UTF-8"
    describe(results)[[doc_id]] <- basic.tagged.descriptives(results, lang=lang, txt.vector=txt.vector, doc_id=doc_id)

    return(results)
  }
)


#' @export
#' @docType methods
#' @rdname treetag-methods
#' @aliases treetag,kRp.connection-method
#' @include 01_class_81_kRp.connection_union.R
setMethod("treetag",
  signature(file="kRp.connection"),
  function(file, treetagger="kRp.env", rm.sgml=TRUE, lang="kRp.env",
    apply.sentc.end=TRUE, sentc.end=c(".","!","?",";",":"),
    encoding=NULL, TT.options=NULL, debug=FALSE, TT.tknz=TRUE,
    format=NA, stopwords=NULL, stemmer=NULL, doc_id=NA, add.desc="kRp.env"){

    # TreeTagger can only read regular files, so the connection's content
    # is dumped into a temporary file first
    tempInputFile <- dumpTextToTempfile(text=file, encoding=encoding)
    # in debug mode the tempfile is kept for inspection, otherwise it is
    # removed when this method returns
    if(!isTRUE(debug)){
      on.exit(unlink(tempInputFile), add=TRUE)
    } else {}

    # delegate all actual work to the character method, pointing it
    # at the temporary file
    return(
      treetag(
        file=tempInputFile,
        treetagger=treetagger,
        rm.sgml=rm.sgml,
        lang=lang,
        apply.sentc.end=apply.sentc.end,
        sentc.end=sentc.end,
        encoding=encoding,
        TT.options=TT.options,
        debug=debug,
        TT.tknz=TT.tknz,
        format="file",
        stopwords=stopwords,
        stemmer=stemmer,
        doc_id=doc_id,
        add.desc=add.desc
      )
    )
  }
)
# unDocUMeantIt/koRpus documentation built on May 21, 2021, 9:26 p.m.