# Source file: R/lang.support-it.R

# Defines functions: .onAttach, lang.support.it

# Documented in: lang.support.it

# Copyright 2010-2020 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.lang.it.
#
# koRpus.lang.it is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus.lang.it is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.lang.it.  If not, see <http://www.gnu.org/licenses/>.


# this is an internal file providing language support.
# please refer to inst/README.languages for details

#' Language support for Italian
#' 
#' This function adds support for Italian to the koRpus package. You should not
#' need to call it manually, as that is done automatically when this package is
#' being loaded.
#' 
#' In particular, this function adds the following:
#' \itemize{
#'  \item \code{lang}: The additional language "it" to be used with koRpus
#'  \item \code{treetag}: The additional preset "it", implemented according to the respective TreeTagger[1] script
#'  \item \code{POS tags}: An additional set of tags, implemented using the documentation for the corresponding
#'    TreeTagger parameter sets[2, 3]
#' }
#' Hyphenation patterns are provided by means of the \code{\link[sylly.it:hyph.support.it]{sylly.it}} package.
#'
#' @param ... Optional arguments for \code{\link[koRpus:set.lang.support]{set.lang.support}}.
#'    They are passed on to both calls of that function (the \code{"treetag"} preset
#'    and the \code{"kRp.POS.tags"} definitions).
#' @return The function is called for its side effect of registering the Italian
#'    language support. Its value is whatever the final call to
#'    \code{\link[koRpus:set.lang.support]{set.lang.support}} returns.
#' @references
#' [1] \url{http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/}
#'
#' [2] \url{http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/italian-tagset.txt}
#' 
#' [3] \url{http://sslmit.unibo.it/~baroni/collocazioni/itwac.tagset.txt} (alt. tagset)
#' @export
#' @importFrom koRpus set.lang.support
#' @examples
#' lang.support.it()
lang.support.it <- function(...) {
  # step 1: register the TreeTagger preset "it"
  koRpus::set.lang.support("treetag",
    list("it"=list(
      ## preset: "it"
      # tags UTF-8 encoded text files
      # Alberto Mirisola added this Italian section
      lang="it",
      encoding="UTF-8",
      preset=function(TT.cmd, TT.bin, TT.lib, unix.OS){
        # the presets for unix and windows systems only differ in the file
        # name of the tagger binary, all other settings are shared
        TT.tagger.bin <- if(isTRUE(unix.OS)) "tree-tagger" else "tree-tagger.exe"
        return(
          list(
            TT.tokenizer      = file.path(TT.cmd, "utf8-tokenize.perl"),
            TT.tagger         = file.path(TT.bin, TT.tagger.bin),
            TT.abbrev         = file.path(TT.lib, "italian-abbreviations"),
            TT.params         = file.path(TT.lib, "italian.par"),
            TT.tknz.opts      = "-i",
            # c() evaluates to NULL; the named entries are still kept in the list
            TT.lookup.command = c(),
            TT.filter.command = c()
          )
        )
      })
    ),
    ...
  )

  # step 2: register the POS tag definitions; each matrix row is one tag with
  # the columns "tag", "wclass" (word class) and "desc" (description)
  koRpus::set.lang.support("kRp.POS.tags",
    ## tag and class definitions
    # it -- Italian
    # Alberto Mirisola added the initial Italian tags
    list("it"=list(
      tag.class.def.words=matrix(c(
        "ABR", "abbreviation", "Abbreviation",
        "ADJ", "adjective", "Adjective",
        "ADV", "adverb", "Adverb",
        "ADV:mente", "adverb", "Adverb ending in -mente", # alt. tag set
        "ART", "article", "Article", # alt. tag set
        "ARTPRE", "preposition", "Preposition + article", # alt. tag set
        "AUX:fin", "auxiliary", "Auxiliary finite", # alt. tag set
        "AUX:fin:cli", "auxiliary", "Auxiliary finite with clitic", # alt. tag set
        "AUX:geru", "auxiliary", "Auxiliary gerundive", # alt. tag set
        "AUX:geru:cli", "auxiliary", "Auxiliary gerundive with clitic", # alt. tag set
        "AUX:infi", "auxiliary", "Auxiliary infinitive", # alt. tag set
        "AUX:infi:cli", "auxiliary", "Auxiliary infinitive with clitic", # alt. tag set
        "AUX:ppast", "auxiliary", "Auxiliary past participle", # alt. tag set
        "AUX:ppre", "auxiliary", "Auxiliary present participle", # alt. tag set
        "CHE", "che", "Che", # alt. tag set
        "CLI", "clitic", "Clitic", # alt. tag set
        "CON", "conjunction", "Conjunction",
        # "DET:def" and "DET:indef" appear in *both* tag sets, as
        # "article" in the default tag set and as "determiner" in the
        # alternative. set to the broader term "determiner":
        "DET:def", "determiner", "Definite determiner/article",
        "DET:indef", "determiner", "Indefinite determiner/article", # *both* tag sets, "article" in default tag set
        "DET:demo", "determiner", "Demonstrative determiner", # alt. tag set
        "DET:num", "determiner", "Numeral determiner", # alt. tag set
        "DET:poss", "determiner", "Possessive determiner", # alt. tag set
        "DET:wh", "determiner", "Wh determiner", # alt. tag set
        "INT", "interjection", "Interjection",
        "FW", "foreign", "Foreign word",
        "LS", "listmarker", "List item marker",
        "NEG", "negation", "Negation", # alt. tag set
        "NOCAT", "unknown", "Non-linguistic element", # alt. tag set
        "NOM", "noun", "Noun",
        "NOUN", "noun", "Noun", # alt. tag set
        "NPR", "name", "Proper noun",
        "NUM", "number", "Number",
        "ORD", "number", "Ordinal number",
        "PRE", "preposition", "Preposition",
        "PRE:det", "preposition", "Preposition + determiner/article",
        "PRO", "pronoun", "Pronoun",
        "PRO:demo", "pronoun", "Demonstrative pronoun",
        "PRO:indef", "pronoun", "Indefinite pronoun",
        "PRO:inter", "pronoun", "Interrogative pronoun",
        "PRO:num", "pronoun", "Numeral pronoun", # alt. tag set
        "PRO:pers", "pronoun", "Personal pronoun",
        "PRO:poss", "pronoun", "Possessive pronoun",
        "PRO:refl", "pronoun", "Reflexive pronoun",
        "PRO:rela", "pronoun", "Relative pronoun",
        "SYM", "symbol", "Symbol",
        "VER:cimp", "verb", "Verb conjunctive imperfect",
        "VER:cond", "verb", "Verb conditional",
        "VER:cpre", "verb", "Verb conjunctive present",
        "VER:fin", "verb", "Verb finite", # alt. tag set
        "VER:fin:cli", "verb", "Verb finite with clitic", # alt. tag set
        "VER:futu", "verb", "Verb future tense",
        "VER:geru", "verb", "Verb gerund",
        "VER:geru:cli", "verb", "Verb gerundive with clitic", # alt. tag set
        "VER:impe", "verb", "Verb imperative",
        "VER:impf", "verb", "Verb imperfect",
        "VER:infi", "verb", "Verb infinitive",
        "VER:infi:cli", "verb", "Verb infinitive with clitic", # alt. tag set
        "VER:ppast", "verb", "Verb past participle", # alt. tag set
        "VER:ppast:cli", "verb", "Verb past participle with clitic", # alt. tag set
        "VER:pper", "verb", "Verb participle perfect",
        "VER:ppre", "verb", "Verb participle present",
        "VER:pres", "verb", "Verb present",
        "VER:refl:infi", "verb", "Verb reflexive infinitive",
        "VER:remo", "verb", "Verb simple past",
        "VER2:fin", "verb", "Verb finite modal/causal", # alt. tag set
        "VER2:fin:cli", "verb", "Verb finite modal/causal with clitic", # alt. tag set
        "VER2:geru", "verb", "Verb gerundive modal/causal", # alt. tag set
        "VER2:geru:cli", "verb", "Verb gerundive modal/causal with clitic", # alt. tag set
        "VER2:infi", "verb", "Verb infinitive modal/causal", # alt. tag set
        "VER2:infi:cli", "verb", "Verb infinitive modal/causal with clitic", # alt. tag set
        "VER2:ppast", "verb", "Verb past participle modal/causal", # alt. tag set
        "VER2:ppre", "verb", "Verb present participle modal/causal", # alt. tag set
        "WH", "wh", "Wh word" # alt. tag set
        ), ncol = 3, byrow = TRUE, dimnames = list(c(), c("tag", "wclass", "desc"))),
      tag.class.def.punct=matrix(c(
        "PON", "punctuation", "Punctuation",
        "PUN", "punctuation", "Punctuation" # alt. tag set
        ), ncol = 3, byrow = TRUE, dimnames = list(c(), c("tag", "wclass", "desc"))),
      tag.class.def.sentc=matrix(c(
        "SENT", "fullstop", "Sentence ending punctuation"
        ), ncol = 3, byrow = TRUE, dimnames = list(c(), c("tag", "wclass", "desc")))
      )
    ),
    ...
  )
}

# this internal, non-exported function is the package's attach hook: R calls
# .onAttach() when the package is attached (e.g., by library()), which causes
# the language support to be properly added at that point.
# whatever arguments R passes to the hook are accepted via "..." and not used.
#' @importFrom sylly.it hyph.support.it
.onAttach <- function(...) {
  # adds the TreeTagger preset and POS tag definitions for "it" to koRpus
  lang.support.it()
  # adds the Italian hyphenation support provided by the sylly.it package
  sylly.it::hyph.support.it()
}
# unDocUMeantIt/koRpus.lang.it documentation built on Oct. 25, 2020, 10:21 a.m.