# inst/doc/koRpus_vignette.R

## ----setup, include=FALSE-----------------------------------------------------
# Create vignette_header.html in the working directory; it holds a single
# <meta> tag carrying the flattr id. The connection is opened for text writing
# up front and released as soon as the line has been written.
header_con <- file("vignette_header.html", open="wt")
writeLines(
  text='<meta name="flattr:id" content="4zdzgd" />',
  con=header_con
)
close(header_con)

## ---- include=FALSE, cache=FALSE----------------------------------------------
library(koRpus)
# The koRpus.lang.en package might be missing, so the English ("en") POS tag
# definitions are registered by hand. The helper and the three intermediate
# tables are kept inside local() so that they do not linger in the workspace.
local({
  # Every definition table is a three-column character matrix
  # (tag, word class, description) that is filled row by row.
  as_tag_table <- function(cells) {
    matrix(
      cells,
      ncol=3,
      byrow=TRUE,
      dimnames=list(NULL, c("tag", "wclass", "desc"))
    )
  }

  # tags for words
  en_words <- as_tag_table(c(
    "CC", "conjunction", "Coordinating conjunction",
    "CD", "number", "Cardinal number",
    "DT", "determiner", "Determiner",
    "EX", "existential", "Existential there",
    "FW", "foreign", "Foreign word",
    "IN", "preposition", "Preposition or subordinating conjunction",
    "IN/that", "preposition", "Preposition or subordinating conjunction",
    "JJ", "adjective", "Adjective",
    "JJR", "adjective", "Adjective, comparative",
    "JJS", "adjective", "Adjective, superlative",
    "LS", "listmarker", "List item marker",
    "MD", "modal", "Modal",
    "NN", "noun", "Noun, singular or mass",
    "NNS", "noun", "Noun, plural",
    "NP", "name", "Proper noun, singular",
    "NPS", "name", "Proper noun, plural",
    "NS", "noun", "Noun, plural", # undocumented, bug in parameter file?
    "PDT", "predeterminer", "Predeterminer",
    "POS", "possesive", "Possessive ending",
    "PP", "pronoun", "Personal pronoun",
    "PP$", "pronoun", "Possessive pronoun",
    "RB", "adverb", "Adverb",
    "RBR", "adverb", "Adverb, comparative",
    "RBS", "adverb", "Adverb, superlative",
    "RP", "particle", " Particle",
    "SYM", "symbol", "Symbol",
    "TO", "to", "to",
    "UH", "interjection", "Interjection",
    "VB", "verb", "Verb, base form of \"to be\"",
    "VBD", "verb", "Verb, past tense of \"to be\"",
    "VBG", "verb", "Verb, gerund or present participle of \"to be\"",
    "VBN", "verb", "Verb, past participle of \"to be\"",
    "VBP", "verb", "Verb, non-3rd person singular present of \"to be\"",
    "VBZ", "verb", "Verb, 3rd person singular present of \"to be\"",
    "VH", "verb", "Verb, base form of \"to have\"",
    "VHD", "verb", "Verb, past tense of \"to have\"",
    "VHG", "verb", "Verb, gerund or present participle of \"to have\"",
    "VHN", "verb", "Verb, past participle of \"to have\"",
    "VHP", "verb", "Verb, non-3rd person singular present of \"to have\"",
    "VHZ", "verb", "Verb, 3rd person singular present of \"to have\"",
    "VV", "verb", "Verb, base form",
    "VVD", "verb", "Verb, past tense",
    "VVG", "verb", "Verb, gerund or present participle",
    "VVN", "verb", "Verb, past participle",
    "VVP", "verb", "Verb, non-3rd person singular present",
    "VVZ", "verb", "Verb, 3rd person singular present",
    "WDT", "determiner", "Wh-determiner",
    "WP", "pronoun", "Wh-pronoun",
    "WP$", "pronoun", "Possessive wh-pronoun",
    "WRB", "adverb", "Wh-adverb"
  ))

  # tags for punctuation within a sentence
  en_punct <- as_tag_table(c(
    ",", "comma", "Comma", # not in guidelines
    "(", "punctuation", "Opening bracket", # not in guidelines
    ")", "punctuation", "Closing bracket", # not in guidelines
    ":", "punctuation", "Punctuation", # not in guidelines
    "``", "punctuation", "Quote", # not in guidelines
    "''", "punctuation", "End quote", # not in guidelines
    "#", "punctuation", "Punctuation", # not in guidelines
    "$", "punctuation", "Punctuation" # not in guidelines
  ))

  # tags for sentence endings
  en_sentc <- as_tag_table(c(
    "SENT", "fullstop", "Sentence ending punctuation" # not in guidelines
  ))

  # en -- english
  koRpus::set.lang.support(
    "kRp.POS.tags",
    list(
      en=list(
        tag.class.def.words=en_words,
        tag.class.def.punct=en_punct,
        tag.class.def.sentc=en_sentc
      )
    )
  )
})
# hyphen() is also made to believe that "en" is an available language: a
# minimal one-row pattern object is registered for it, while the vignette
# actually works with a previously hyphenated object
fake.hyph.en <- methods::new(
  "kRp.hyph.pat",
  lang="en",
  pattern=matrix(
    c(".im5b", ".imb", "0050"),
    nrow=1,
    dimnames=list(NULL, c("orig", "char", "nums"))
  )
)
set.hyph.support(list(en=fake.hyph.en))

## ---- set-options, echo=FALSE, cache=FALSE-----------------------------------------
# use an output width of 85 characters for everything printed below
options(width = 85)

## ---- eval=FALSE-------------------------------------------------------------------
#  # install the language support package
#  install.koRpus.lang("en")
#  # load the package
#  library(koRpus.lang.en)

## ---- eval=FALSE-------------------------------------------------------------------
#  tagged.text <- treetag(
#    "sample_text.txt",
#    treetagger="manual",
#    lang="en",
#    TT.options=list(
#      path="~/bin/treetagger/",
#      preset="en"
#    ),
#    doc_id="sample"
#  )

## ---- include=FALSE, cache=FALSE---------------------------------------------------
# read the tagged text object back from its dumped R representation
tagged.text <- dget(file="sample_text_treetagged_dput.txt")

## ----------------------------------------------------------------------------------
tagged.text

## ----------------------------------------------------------------------------------
# tokenize the same sample file with tokenize(), then print the result
tokenized.text <- tokenize(
  "sample_text.txt",
  lang="en",
  doc_id="sample"
)
tokenized.text

## ----------------------------------------------------------------------------------
# rows 26 to 34 of the token table
taggedText(tagged.text)[seq.int(26L, 34L), ]

## ----------------------------------------------------------------------------------
# first 50 values of the "lttr" element
head(x=tagged.text[["lttr"]], n=50)

## ----------------------------------------------------------------------------------
# the object can be subset by rows directly
tagged.text[seq_len(5), ]

## ---- eval=FALSE-------------------------------------------------------------------
#  describe(tagged.text)

## ---- echo=FALSE-------------------------------------------------------------------
# keep the descriptive statistics, print them, and pull out the
# "lttr.distrib" element for later use
txt_desc <- describe(tagged.text)
txt_desc
txt_desc_lttr <- txt_desc[["lttr.distrib"]]

## ---- eval=FALSE-------------------------------------------------------------------
#  lex.div(
#    tagged.text,
#    measure=c("TTR", "MSTTR", "MATTR","HD-D", "MTLD", "MTLD-MA"),
#    char=c("TTR", "MATTR","HD-D", "MTLD", "MTLD-MA")
#  )

## ---- echo=FALSE-------------------------------------------------------------------
# same call as displayed in the vignette, with quiet=TRUE added
lex.div(
  tagged.text,
  quiet=TRUE,
  measure=c("TTR", "MSTTR", "MATTR","HD-D", "MTLD", "MTLD-MA"),
  char=c("TTR", "MATTR","HD-D", "MTLD", "MTLD-MA")
)

## ----------------------------------------------------------------------------------
# a single measure can also be requested through its own wrapper
maas(tagged.text)

## ---- eval=FALSE-------------------------------------------------------------------
#  ttr.res <- TTR(tagged.text, char=TRUE)
#  plot(ttr.res@TTR.char, type="l", main="TTR degradation over text length")

## ---- echo=FALSE-------------------------------------------------------------------
# Compute TTR with char=TRUE and quiet=TRUE, then draw the contents of the
# TTR.char slot as a line plot.
ttr.res <- TTR(tagged.text, char=TRUE, quiet=TRUE)
# plot title spelling fixed: "degradation" (was "degredation")
plot(ttr.res@TTR.char, type="l", main="TTR degradation over text length")

## ----------------------------------------------------------------------------------
# MSTTR with segment=92
MSTTR(tagged.text, segment=92)

## ---- eval=FALSE-------------------------------------------------------------------
#  LCC.en <- read.corp.LCC("~/downloads/corpora/eng_news_2010_1M-text.tar")

## ---- eval=FALSE-------------------------------------------------------------------
#  query(LCC.en, "word", "what")

## ---- eval=FALSE-------------------------------------------------------------------
#  query(LCC.en, "pmio", c(780, 790))

## ---- eval=FALSE-------------------------------------------------------------------
#  freq.analysis.res <- freq.analysis(tagged.text, corp.freq=LCC.en)

## ---- eval=FALSE-------------------------------------------------------------------
#  taggedText(freq.analysis.res)

## ---- eval=FALSE-------------------------------------------------------------------
#  describe(freq.analysis.res)[["sentc.length"]]

## ---- eval=FALSE-------------------------------------------------------------------
#  describe(freq.analysis.res)$classes

## ---- eval=FALSE-------------------------------------------------------------------
#  (hyph.txt.en <- hyphen(tagged.text))

## ---- include=FALSE, cache=FALSE---------------------------------------------------
# read the previously hyphenated object back from its dumped R representation
# instead of running hyphen()
hyph.txt.en <- dget(file="sample_text_hyphenated_dput.txt")

## ----------------------------------------------------------------------------------
hyph.txt.en

## ----------------------------------------------------------------------------------
# first rows of what hyphenText() returns for the object
head(x=hyphenText(hyph.txt.en))

## ---- eval=FALSE-------------------------------------------------------------------
#  hyph.txt.en <- correct.hyph(hyph.txt.en)

## ---- eval=FALSE-------------------------------------------------------------------
#  hyph.txt.en <- correct.hyph(hyph.txt.en, word="mech-a-nisms", hyphen="mech-a-ni-sms")

## ---- eval=FALSE-------------------------------------------------------------------
#  readbl.txt <- readability(tagged.text, hyphen=hyph.txt.en)

## ---- echo=FALSE-------------------------------------------------------------------
# run readability() on the tagged text together with the hyphenated object;
# warnings raised by the call are suppressed
readbl.txt <- suppressWarnings(
  readability(tagged.text, hyphen=hyph.txt.en)
)

## ----------------------------------------------------------------------------------
readbl.txt

## ----------------------------------------------------------------------------------
summary(readbl.txt)

## ----------------------------------------------------------------------------------
summary(readbl.txt, flat=TRUE)

## ----------------------------------------------------------------------------------
# individual measures are also available through their own wrappers
flesch.res <- flesch(tagged.text, hyphen=hyph.txt.en)
# LIX doesn't need syllable count, so no hyphen object is passed
(lix.res <- LIX(tagged.text))

## ---- eval=FALSE-------------------------------------------------------------------
#  guessed <- guess.lang(
#    file.path(find.package("koRpus"),"tests","testthat","sample_text.txt"),
#    udhr.path="~/downloads/udhr_txt.zip"
#  )
#  summary(guessed)

# Try the koRpus package in your browser
#
# Any scripts or data that you put into this service are public.
#
# koRpus documentation built on May 18, 2021, 1:13 a.m.