# Copyright 2010-2020 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.lang.it.
#
# koRpus.lang.it is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus.lang.it is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.lang.it. If not, see <http://www.gnu.org/licenses/>.
# this is an internal file providing language support.
# please refer to inst/README.languages for details
#' Language support for Italian
#'
#' This function adds support for Italian to the koRpus package. You should not
#' need to call it manually, as that is done automatically when this package is
#' being loaded.
#'
#' In particular, this function adds the following:
#' \itemize{
#'  \item \code{lang}: The additional language "it" to be used with koRpus
#'  \item \code{treetag}: The additional preset "it", implemented according to the respective TreeTagger[1] script
#'  \item \code{POS tags}: An additional set of tags, implemented using the documentation for the corresponding
#'    TreeTagger parameter sets[2, 3]
#' }
#' Hyphenation patterns are provided by means of the \code{\link[sylly.it:hyph.support.it]{sylly.it}} package.
#'
#' @param ... Optional arguments for \code{\link[koRpus:set.lang.support]{set.lang.support}}.
#'    They are forwarded unchanged to both the "treetag" and the "kRp.POS.tags" call.
#' @return Called for its side effects. The value of the second
#'    \code{\link[koRpus:set.lang.support]{set.lang.support}} call is passed through as is.
#' @references
#' [1] \url{http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/}
#'
#' [2] \url{http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/italian-tagset.txt}
#'
#' [3] \url{http://sslmit.unibo.it/~baroni/collocazioni/itwac.tagset.txt} (alt. tagset)
#' @export
#' @importFrom koRpus set.lang.support
#' @examples
#' lang.support.it()
lang.support.it <- function(...) {
  # all tag tables share the same layout: one row per tag, filled row-wise
  # into the three columns "tag", "wclass" and "desc"
  tag.table <- function(entries) {
    matrix(
      entries,
      ncol = 3,
      byrow = TRUE,
      dimnames = list(c(), c("tag", "wclass", "desc"))
    )
  }

  ## preset: "it"
  # tags UTF-8 encoded text files
  # Alberto Mirisola added this Italian section
  koRpus::set.lang.support(
    "treetag",
    list(
      "it" = list(
        lang = "it",
        encoding = "UTF-8",
        preset = function(TT.cmd, TT.bin, TT.lib, unix.OS) {
          # the unix and windows presets are identical except for the
          # file name of the tagger binary
          tagger.bin <- if (isTRUE(unix.OS)) "tree-tagger" else "tree-tagger.exe"
          list(
            TT.tokenizer = file.path(TT.cmd, "utf8-tokenize.perl"),
            TT.tagger = file.path(TT.bin, tagger.bin),
            TT.abbrev = file.path(TT.lib, "italian-abbreviations"),
            TT.params = file.path(TT.lib, "italian.par"),
            TT.tknz.opts = "-i",
            TT.lookup.command = c(),
            TT.filter.command = c()
          )
        }
      )
    ),
    ...
  )

  ## tag and class definitions
  # it -- italian
  # Alberto Mirisola added the initial Italian tags
  koRpus::set.lang.support(
    "kRp.POS.tags",
    list(
      "it" = list(
        tag.class.def.words = tag.table(c(
          "ABR", "abbreviation", "Abbreviation",
          "ADJ", "adjective", "Adjective",
          "ADV", "adverb", "Adverb",
          "ADV:mente", "adverb", "Adverb ending in -mente", # alt. tag set
          "ART", "article", "Article", # alt. tag set
          "ARTPRE", "preposition", "Preposition + article", # alt. tag set
          "AUX:fin", "auxiliary", "Auxiliary finite", # alt. tag set
          "AUX:fin:cli", "auxiliary", "Auxiliary finite with clitic", # alt. tag set
          "AUX:geru", "auxiliary", "Auxiliary gerundive", # alt. tag set
          "AUX:geru:cli", "auxiliary", "Auxiliary gerundive with clitic", # alt. tag set
          "AUX:infi", "auxiliary", "Auxiliary infinitive", # alt. tag set
          "AUX:infi:cli", "auxiliary", "Auxiliary infinitive with clitic", # alt. tag set
          "AUX:ppast", "auxiliary", "Auxiliary past participle", # alt. tag set
          "AUX:ppre", "auxiliary", "Auxiliary present participle", # alt. tag set
          "CHE", "che", "Che", # alt. tag set
          "CLI", "clitic", "Clitic", # alt. tag set
          "CON", "conjunction", "Conjunction",
          # "DET:def" and "DET:indef" appear in *both* tag sets, as
          # "article" in the default tag set and as "determiner" in the
          # alternative. set to the broader term "determiner":
          "DET:def", "determiner", "Definite determiner/article",
          "DET:indef", "determiner", "Indefinite determiner/article", # *both* tag sets, "article" in default tag set
          "DET:demo", "determiner", "Demonstrative determiner", # alt. tag set
          "DET:num", "determiner", "Numeral determiner", # alt. tag set
          "DET:poss", "determiner", "Possessive determiner", # alt. tag set
          "DET:wh", "determiner", "Wh determiner", # alt. tag set
          "INT", "interjection", "Interjection",
          "FW", "foreign", "Foreign word",
          "LS", "listmarker", "List item marker",
          "NEG", "negation", "Negation", # alt. tag set
          "NOCAT", "unknown", "Non-linguistic element", # alt. tag set
          "NOM", "noun", "Noun",
          "NOUN", "noun", "Noun", # alt. tag set
          "NPR", "name", "Proper noun",
          "NUM", "number", "Number",
          "ORD", "number", "Ordinal number",
          "PRE", "preposition", "Preposition",
          "PRE:det", "preposition", "Preposition + determiner/article",
          "PRO", "pronoun", "Pronoun",
          "PRO:demo", "pronoun", "Demonstrative pronoun",
          "PRO:indef", "pronoun", "Indefinite pronoun",
          "PRO:inter", "pronoun", "Interrogative pronoun",
          "PRO:num", "pronoun", "Numeral pronoun", # alt. tag set
          "PRO:pers", "pronoun", "Personal pronoun",
          "PRO:poss", "pronoun", "Possessive pronoun",
          "PRO:refl", "pronoun", "Reflexive pronoun",
          "PRO:rela", "pronoun", "Relative pronoun",
          "SYM", "symbol", "Symbol",
          "VER:cimp", "verb", "Verb conjunctive imperfect",
          "VER:cond", "verb", "Verb conditional",
          "VER:cpre", "verb", "Verb conjunctive present",
          "VER:fin", "verb", "Verb finite", # alt. tag set
          "VER:fin:cli", "verb", "Verb finite with clitic", # alt. tag set
          "VER:futu", "verb", "Verb future tense",
          "VER:geru", "verb", "Verb gerund",
          "VER:geru:cli", "verb", "Verb gerundive with clitic", # alt. tag set
          "VER:impe", "verb", "Verb imperative",
          "VER:impf", "verb", "Verb imperfect",
          "VER:infi", "verb", "Verb infinitive",
          "VER:infi:cli", "verb", "Verb infinitive with clitic", # alt. tag set
          "VER:ppast", "verb", "Verb past participle", # alt. tag set
          "VER:ppast:cli", "verb", "Verb past participle with clitic", # alt. tag set
          "VER:pper", "verb", "Verb participle perfect",
          "VER:ppre", "verb", "Verb participle present",
          "VER:pres", "verb", "Verb present",
          "VER:refl:infi", "verb", "Verb reflexive infinitive",
          "VER:remo", "verb", "Verb simple past",
          "VER2:fin", "verb", "Verb finite modal/causal", # alt. tag set
          "VER2:fin:cli", "verb", "Verb finite modal/causal with clitic", # alt. tag set
          "VER2:geru", "verb", "Verb gerundive modal/causal", # alt. tag set
          "VER2:geru:cli", "verb", "Verb gerundive modal/causal with clitic", # alt. tag set
          "VER2:infi", "verb", "Verb infinitive modal/causal", # alt. tag set
          "VER2:infi:cli", "verb", "Verb infinitive modal/causal with clitic", # alt. tag set
          "VER2:ppast", "verb", "Verb past participle modal/causal", # alt. tag set
          "VER2:ppre", "verb", "Verb present participle modal/causal", # alt. tag set
          "WH", "wh", "Wh word" # alt. tag set
        )),
        tag.class.def.punct = tag.table(c(
          "PON", "punctuation", "Punctuation",
          "PUN", "punctuation", "Punctuation" # alt. tag set
        )),
        tag.class.def.sentc = tag.table(c(
          "SENT", "fullstop", "Sentence ending punctuation"
        ))
      )
    ),
    ...
  )
}
# Internal attach hook (not exported): as soon as the package is attached,
# it first adds the Italian language support defined in this file and then
# calls sylly.it::hyph.support.it(). Any arguments handed to the hook are
# accepted via `...` and not used.
#' @importFrom sylly.it hyph.support.it
.onAttach <- function(...) {
  # koRpus part: treetag preset and POS tag definitions for "it"
  lang.support.it()
  # sylly part: delegated to the sylly.it package
  sylly.it::hyph.support.it()
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.