R/sylly-internal.R

Defines functions check_lang_packages check.file value.distribs is.supported.lang load.hyph.pattern optimize.hyph.pattern kRp.hyphen.calc hyphen.word write.hyph.cache.file read.hyph.cache.file set.hyph.cache check.hyph.cache get.hyph.cache explode.word explode.letters koRpus2sylly

# Copyright 2010-2020 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package sylly.
#
# sylly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# sylly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with sylly.  If not, see <http://www.gnu.org/licenses/>.

#' @importFrom stats setNames
#' @importFrom utils data file_test head modifyList setTxtProgressBar tail txtProgressBar

## function koRpus2sylly()
# transitional internal function to transform hyphen pattern data, that was
# saved using koRpus, into sylly data. the problem with the old files is that
# the S4 objects were saved with an attribute refering to the koRpus package,
# making it impossible to use sylly *without* also loading koRpus automatically
# - path: full path to an old *.rda file
koRpus2sylly <- function(path, replaceFile=TRUE){
  intEnv <- new.env()
  path <- normalizePath(path, mustWork=TRUE)
  load(path, envir=intEnv)
  hyphPatName <- ls(envir=intEnv)
  hyphPat <- intEnv[[hyphPatName]]
  currentPackage <- attr(class(hyphPat), "package")
  if(identical(currentPackage, "koRpus")){
    message(paste0("converting ", hyphPatName, " from koRpus to sylly"))
    attr(class(hyphPat), "package") <- "sylly"
    intEnv[[hyphPatName]] <- hyphPat
    if(isTRUE(replaceFile)){
      save(
        list=hyphPatName,
        envir=intEnv,
        file=path,
        compress="xz",
        compression_level=-9
      )
    }
  } else {
    message(paste0(hyphPatName, " is not a koRpus object (package: ", currentPackage, ")"))
  }
  return(invisible(NULL))
} ## end function koRpus2sylly()


## function explode.letters()
# all possible values of .word.:
# 6 x 1: . w o r d .
#        1 2 3 4 5 6
# 5 x 2: .w wo or rd d.
#        12 23 34 45 56
# 4 x 3: .wo wor ord rd.
#        123 234 345 456
# 3 x 4: .wor word ord.
#        1234 2345 3456
# 2 x 5: .word word.
#        12345 23456
# 1 x 6: .word.
#        123456
#
explode.letters <- function(max.word.length=get.sylly.env(hyph.max.token.length=TRUE)){
  result <- lapply(
    1:max.word.length,
    function(wl){
      lapply(
        1:wl,
        function(sl){
          lapply(
            1:(wl-sl+1),
            function(x){
              x:(x+sl-1)
            }
          )
        }
      )
    }
  )
  return(result)
} ## end function explode.letters()


## function explode.word()
# using the provided patterns, split an actual word into its subpatterns
# - min.pattern/max.pattern: set the minimum and maximum length of available
#   patterns, makes no sense to split further in the first place
explode.word <- function(word, min.pattern=2L, max.pattern=5L){
  word.length <- nchar(word)
  hyph.max.token.length <- get.sylly.env(hyph.max.token.length=TRUE)
  if(word.length > hyph.max.token.length){
    # update internal pattern cache to cope with longer words
    set.sylly.env(hyph.max.token.length=word.length + 5)
  } else {}
  if(word.length <= min.pattern){
    result <- data.frame(frag=word, on=1L, off=min(word.length, min.pattern))
  } else {
    all.patterns <- mget("all.patterns", envir=as.environment(.sylly.env), ifnotfound=list(NULL))[["all.patterns"]]
    result <- matrix(unlist(vapply(
      unlist(all.patterns[[word.length]][min.pattern:min(word.length, max.pattern)], recursive=FALSE),
      function(lttrs){
        return(
          # already include a dummy 'match' row
          c(
            substr(word, lttrs[1], max(lttrs)), # frag
            lttrs[1],                           # on
            max(lttrs),                         # off
            NA                                  # match
          )
        )
      },
      FUN.VALUE=c(frag="", on=0, off=0, match=NA),
      USE.NAMES=FALSE
    )), nrow=4L, dimnames=list(c("frag","on","off","match"),NULL))
  }
  return(result)
} ## function explode.word()


## function get.hyph.cache()
get.hyph.cache <- function(lang){
  # don't try anything while cache is locked
  locked <- mget("hyphenCacheLock", envir=as.environment(.sylly.env), ifnotfound=list(hyphenCacheLock=FALSE))[["hyphenCacheLock"]]
  while(isTRUE(locked)){
    Sys.sleep(0.5)
    locked <- mget("hyphenCacheLock", envir=as.environment(.sylly.env), ifnotfound=list(hyphenCacheLock=FALSE))[["hyphenCacheLock"]]
  }
  # simply get cache from current sylly environment
  # returns NULL if none exists
  if(isTRUE(lang == "all")){
    return(mget("hyphenCache", envir=as.environment(.sylly.env), ifnotfound=list(NULL))[["hyphenCache"]])
  } else {
    return(mget("hyphenCache", envir=as.environment(.sylly.env), ifnotfound=list(NULL))[["hyphenCache"]][[lang]])
  }
}
## end function get.hyph.cache()


## function check.hyph.cache()
# called by hyphen(), returns either the cached entry, or NULL
# - missing: if TRUE, all elements of token are looked up at once, and
#   a vector of all missing tokens is returned, or NULL
# - multiple: if TRUE, token must be a character vector, result is a data.frame
check.hyph.cache <- function(lang, token, cache=get.hyph.cache(lang=lang), missing=FALSE, multiple=FALSE){
  result <- NULL
  if(is.null(cache)){
    # no cache, no hit...
    if(isTRUE(missing)){
      result <- token
    } else {}
  } else {
    if(isTRUE(missing)){
      missing.tokens <- token[!token %in% names(cache)]
      if(length(missing.tokens) > 0){
        result <- missing.tokens
      } else {}
    } else if(isTRUE(multiple)){
      # update fields with data from cache if available
      result <- as.data.frame(t(
        vapply(
          token,
          function(tk){
            inCache <- cache[[tk]]
            if(is.null(inCache)){
              return(c(syll=1, word=tk))
            } else {
              return(c(syll=inCache[["syll"]], word=inCache[["word"]]))
            }
          },
          FUN.VALUE=c(syll=0, word=""),
          USE.NAMES=FALSE
        )
      ), stringsAsFactors=FALSE)
      colnames(result) <- c("syll", "word")
      result[["syll"]] <- as.numeric(result[["syll"]])
    } else {
      # check if this word was hyphenated before;
      # will be NULL if not found
      result <- cache[[token]]
    }
  }
  return(result)
} ## end function check.hyph.cache()


## function set.hyph.cache()
# writes (probably new) cache data back to the environment
# - append: a named list of new data (name is the token, value another named list
#           containing "syll" and "word", respectively)
set.hyph.cache <- function(lang, append=NULL, cache=get.hyph.cache(lang=lang)){
  # don't try anything while cache is locked
  locked <- mget("hyphenCacheLock", envir=as.environment(.sylly.env), ifnotfound=list(hyphenCacheLock=FALSE))[["hyphenCacheLock"]]
  while(isTRUE(locked)){
    Sys.sleep(0.1)
    locked <- mget("hyphenCacheLock", envir=as.environment(.sylly.env), ifnotfound=list(hyphenCacheLock=FALSE))[["hyphenCacheLock"]]
  }
  # now *we* lock the cache
  assign("hyphenCacheLock", list(hyphenCacheLock=TRUE), pos=as.environment(.sylly.env))
  all.kRp.env.hyph <- mget("hyphenCache", envir=as.environment(.sylly.env), ifnotfound=list(NULL))[["hyphenCache"]]
  if(is.null(all.kRp.env.hyph)){
    all.kRp.env.hyph <- new.env()
  } else {}
  # append result to cache
  if(!is.null(append)){
    # could be there is no cache yet
    if(is.null(cache)){
      cache <- new.env()
    } else {}
    # using arbitrary character stuff for names might fail
      try(
        # 'all.names=TRUE' is needed to not lose all tokens that begin with a dot!
        cache <- list2env(as.list(append, all.names=TRUE), envir=cache)
#         cache <- as.environment(modifyList(as.list(cache, all.names=TRUE), as.list(append, all.names=TRUE)))
      )
  } else {
    if(is.null(cache)){
      # hm, if both is null, don't do anything
      assign("hyphenCacheLock", list(hyphenCacheLock=FALSE), pos=as.environment(.sylly.env))
      return(invisible(NULL))
    } else {}
  }

  if(isTRUE(lang == "all")){
    all.kRp.env.hyph <- cache
  } else {
    all.kRp.env.hyph[[lang]] <- cache
  }
  assign("hyphenCache", all.kRp.env.hyph, envir=as.environment(.sylly.env))
  # unlock cache
  assign("hyphenCacheLock", list(hyphenCacheLock=FALSE), pos=as.environment(.sylly.env))
  return(invisible(NULL))
} ## end function set.hyph.cache()


## function read.hyph.cache.file()
# reads a dumped chace file, if "file" is not NULL and does exist
read.hyph.cache.file <- function(lang, file=get.sylly.env(hyph.cache.file=TRUE, errorIfUnset=FALSE), quiet=FALSE){
  if(is.null(file)){
    return(invisible(NULL))
  } else {}

  cache.file.path <- normalizePath(file, mustWork=FALSE)
  if(!file.exists(cache.file.path)){
    # if the file is not there yet, create one
    write.hyph.cache.file(quiet=quiet)
  } else {}
  # only reload the file if it changed or wasn't loaded at all yet
  cacheFileInfo.new <- file.mtime(cache.file.path)
  cacheFileInfo.old <- mget("hyphenCacheFile", envir=as.environment(.sylly.env), ifnotfound=list(NULL))[["hyphenCacheFile"]]
  if(identical(cacheFileInfo.new, cacheFileInfo.old[[lang]])){
    # file doesn't seem to have changed
    return(invisible(NULL))
  } else if(is.null(cacheFileInfo.old)){
    # this must be the first time we try to read the file
    cacheFileInfo.old <- list()
  } else {}
  # set sylly.hyph.cache to NULL to suppress R CMD check warning
  sylly.hyph.cache <- NULL
  load(cache.file.path)
  # data will be checked by set.hyph.cache(), so no need to worry here
  # but the loaded data must contain an environment named "sylly.hyph.cache"
  if(is.null(sylly.hyph.cache)){
    stop(simpleError("The cache file you provided does not contain sylly-ready hyphenation data!"))
  } else {
    # cache format changed with 0.10-2, make sure we're good
    if(is.data.frame(sylly.hyph.cache)){
      warning("Cache file format has changed, trying conversion. If you run into errors, delete your old cache files!", call.=FALSE)
      sylly.hyph.cache <- setNames(
        object=lapply(
          seq_along(sylly.hyph.cache[["token"]]),
          function(n){
            return(
              list(
                syll=as.numeric(sylly.hyph.cache[n,"syll"]),
                word=as.character(sylly.hyph.cache[n,"word"])
              )
            )
          }
        ),
        nm=sylly.hyph.cache[["token"]]
      )
    } else {}
  }
  # set new file data to prevent from reloading if unchanged
  cacheFileInfo.old[[lang]] <- cacheFileInfo.new
  assign("hyphenCacheFile", cacheFileInfo.old, envir=as.environment(.sylly.env))

  # write loaded data to environment
  set.hyph.cache(lang="all", append=sylly.hyph.cache, cache=get.hyph.cache(lang="all"))

  return(invisible(NULL))
} ## end function read.hyph.cache.file()


## function write.hyph.cache.file()
# dumps cache data into a file, if "file" is not NULL. if it doesn't exist, it will be created
write.hyph.cache.file <- function(file=get.sylly.env(hyph.cache.file=TRUE, errorIfUnset=FALSE), quiet=FALSE){
  if(is.null(file)){
    return(invisible(NULL))
  } else {}

  cache.file.path <- normalizePath(file, mustWork=FALSE)
  if(!file.exists(cache.file.path)){
    if(!isTRUE(quiet)){
      message(paste0("Cache file does not exist and will be created:\n  ", cache.file.path))
    } else {}
  } else {}

  sylly.hyph.cache <- get.hyph.cache(lang="all")
  # if there is no cache yet, make it an empty environment
  if(is.null(sylly.hyph.cache)){
    sylly.hyph.cache <- list()
  } else {}
  save(sylly.hyph.cache, file=cache.file.path)

  return(invisible(NULL))
} ## end function write.hyph.cache.file()


## function hyphen.word()
# this helper function is being called by kRp.hyphen.calc(), see below
hyphen.word <- function(
    word,
    hyph.pattern=NULL,
    min.length=4L,
    rm.hyph=TRUE,
    min.pattern=2L,
    max.pattern=5L,
    as.cache=FALSE
  ){
    # consider min length of word?
    if(nchar(word) < min.length){
      if(isTRUE(as.cache)){
        return(list(syll=1, word=word))
      } else {
        return(c(syll=1, word=word))
      }
    } else {}
    word.orig <- word
    ## remove hyphens in word
    if(isTRUE(rm.hyph)){
      word <- gsub("-", "", word)
    } else {}
    # non-letters like leading dots confuse the algorithm. we'll remove any non-alphabetic character
    word <- gsub("[^\\p{L}]+", "", word, perl=TRUE)
    # if this removed all of the token, guess we're finished
    if (identical(word, "")){
      if(isTRUE(as.cache)){
        return(list(syll=1, word=word.orig))
      } else {
        return(c(syll=1, word=word.orig))
      }
    } else {}
    ## convert to lowercase
    word <- tolower(word)
    ## transform "word" to ".word."
    word.dotted <- paste0(".", word, ".")
    word.length <- nchar(word.dotted)

    # create word fragments ".wo", ".wor"... "rd."
    matched.patterns <- explode.word(word.dotted, min.pattern=min.pattern, max.pattern=max.pattern)
    # find all matching patterns of the word fragments
    matched.patterns["match",] <- vapply(
      matched.patterns["frag",],
      function(f){
        in.patterns <- slot(hyph.pattern, "pattern")[[f]]
        return(ifelse(is.null(in.patterns), "", in.patterns[["nums"]]))
      },
      FUN.VALUE="",
      USE.NAMES=FALSE
    )
    # now let's add the found matches and find the maximum
    matched.pat.index <- !"" == matched.patterns["match",]
    if(sum(matched.pat.index) > 0){
      pattern.matrix <- vapply(
        which(matched.pat.index),
        function(got.match){
          word.on <- max(1, (as.numeric(matched.patterns["on",got.match]) - 1))
          word.off <- as.numeric(matched.patterns["off",got.match])
          match.num.code <- unlist(strsplit(matched.patterns["match",got.match], split=""))
          return(c(rep(0, word.on - 1), match.num.code, rep(0, word.length - word.off)))
        },
        FUN.VALUE=rep("", word.length),
        USE.NAMES=FALSE
      )
      # this is the vector with the max values for the dotted word
      pattern.max <- as.numeric(apply(pattern.matrix, 1, max))

      # we'll never hyphenate before a word...
      pattern.max <- pattern.max[-c(1, length(pattern.max))]
      # ... never after the first or before the last letter
      pattern.max[c(1,length(pattern.max)-1,length(pattern.max))] <- 0

      # filter odd positions (count syllables)
      possible.hyphs <- (pattern.max %% 2) != 0
      syllables <- sum(possible.hyphs) + 1
      # recreate word with hyphens
      add.hyphen <- which(possible.hyphs)
      if(isTRUE(rm.hyph)){
        hyph.word <- unlist(strsplit(gsub("-", "", word.orig), split=""))
      } else {
        hyph.word <- unlist(strsplit(word.orig, split=""))
      }
      hyph.word[add.hyphen] <- paste0(hyph.word[add.hyphen], "-")
      # in cases where previous hyphenations were already removed and here returned,
      # don't return double them up
      hyph.word <- gsub("-+", "-", paste(hyph.word, collapse=""))
      if(isTRUE(as.cache)){
        hyph.result <- list(syll=syllables, word=hyph.word)
      } else {
        hyph.result <- c(syll=syllables, word=hyph.word)
      }
    } else {
      ## no hyphenation
      if(isTRUE(as.cache)){
        hyph.result <- list(syll=1, word=word.orig)
      } else {
        hyph.result <- c(syll=1, word=word.orig)
      }
    }
    # this will return *three* elements if as.cache is TRUE
    return(hyph.result)
} ## end function hyphen.word()


# this internal function does the real hyphenations,
# so it's mostly called by hyphen()

########################################################
## if this signature changes, check hyphen() as well! ##
########################################################

# min.length is set to 4 because we'll never hyphenate after the first of before the last letter, so
# words with three letters or less cannot be hyphenated
kRp.hyphen.calc <- function(
  words,
  hyph.pattern=NULL,
  min.length=4L,
  rm.hyph=TRUE,
  quiet=FALSE,
  cache=TRUE,
  lang=NULL,
  as="kRp.hyphen" # can also be "data.frame" or "numeric" to return simpler objects.
                  # "numeric" only returns the numeric results for each word/token
){

  stopifnot(is.character(words))

  # to avoid needless NOTEs from R CMD check
  token <- NULL
  # set a variable to check if we changed the data at all, to later skip the writing back part if possible
  # this is only relevant if cache=TRUE, but needs to be present for a check later on
  writeBackCache <- new.env()
  assign("changed", FALSE, envir=writeBackCache)

  # check for hyphenation pattern.
  if(is.null(hyph.pattern)){
    if(!is.null(lang)){
      # this way the text object defines pattern language
      hyph.pattern <- load.hyph.pattern(lang)
    } else {
      stop(simpleError("No language definition available. Set \"hyph.pattern\"!"))
    }
  } else {
    if(!inherits(hyph.pattern, "kRp.hyph.pat")){
      # the internal function load.hyph.pattern() will return what we need
      hyph.pattern <- load.hyph.pattern(hyph.pattern)
    } else {
      # optimize the pattern object
      hyph.pattern <- optimize.hyph.pattern(hyph.pattern)
    }
    # the other way: take language from hyph.pattern
    # overwrites lang in tagged.text
    lang <- slot(hyph.pattern, "lang")
  }
  if(isTRUE(cache)){
    # check if cached hyphenation data has been set with set.sylly.env().
    # if so, the data will directly be coped to sylly's environment
    read.hyph.cache.file(lang=lang, file=get.sylly.env(hyph.cache.file=TRUE, errorIfUnset=FALSE), quiet=quiet)
  } else {}

  if(!isTRUE(quiet)){
    # feed back the hypenation we're using
    message(paste0("Hyphenation (language: ", lang, ")"))
  } else {}

  # min-lenth and max-length of patterns
  min.pat <- slot(hyph.pattern, "min.pat")
  max.pat <- slot(hyph.pattern, "max.pat")

  ## main loop
  # build a vector with all possible word fragments
  # check for matches of the fragment vector in the pattern db
  uniqueWords <- unique(words)

  if(!isTRUE(quiet)){
    # counter to get some feedback
    .iter.counter <- new.env()
    assign("counter", 1, envir=.iter.counter)
  } else {}

  if(isTRUE(cache)){
    # the fastest way to fill the cache is to first check what types are missing,
    # hyphenate them and append them to cache in *one* go and *then* fetch results
    # for the actual hyphenation all from cache; omit uncachable tokens in the first place
    typesMissingInCache  <- check.hyph.cache(lang=lang, token=uniqueWords[nchar(uniqueWords) >= min.length], missing=TRUE)
    if(length(typesMissingInCache) > 0){
      if(!isTRUE(quiet)){
        # give some feedback, so we know the machine didn't just freeze...
        prgBar <- txtProgressBar(min=0, max=length(typesMissingInCache), style=3)
      } else {}
      typesMissingHyphenated <- setNames(
        object=lapply(
          typesMissingInCache,
          function(nw){
            if(!isTRUE(quiet)){
              # update prograss bar
              iteration.counter <- get("counter", envir=.iter.counter)
              setTxtProgressBar(prgBar, iteration.counter)
              assign("counter", iteration.counter + 1, envir=.iter.counter)
            } else {}
            return(hyphen.word(
              nw,
              hyph.pattern=hyph.pattern,
              min.length=min.length,
              rm.hyph=rm.hyph,
              min.pattern=min.pat,
              max.pattern=max.pat,
              as.cache=TRUE
            ))
          }
        ),
        nm=typesMissingInCache
      )
      typesMissingHyphenated[["syll"]] <- as.numeric(typesMissingHyphenated[["syll"]])
      set.hyph.cache(lang=lang, append=typesMissingHyphenated)
      assign("changed", TRUE, envir=writeBackCache)
      if(!isTRUE(quiet)){
        # close prograss bar
        close(prgBar)
      } else {}
    } else {}
    # fetch results from cache in one go
    hyph.df <- check.hyph.cache(lang=lang, token=words, multiple=TRUE)
  } else {
    if(!isTRUE(quiet)){
      # give some feedback, so we know the machine didn't just freeze...
      prgBar <- txtProgressBar(min=0, max=length(uniqueWords), style=3)
    } else {}
    hyphenate.results <- t(vapply(
      uniqueWords,
      FUN=function(nw){
        if(!isTRUE(quiet)){
          # update prograss bar
          iteration.counter <- get("counter", envir=.iter.counter)
          setTxtProgressBar(prgBar, iteration.counter)
          assign("counter", iteration.counter + 1, envir=.iter.counter)
        } else {}
        return(
          hyphen.word(
            nw,
            hyph.pattern=hyph.pattern,
            min.length=min.length,
            rm.hyph=rm.hyph,
            min.pattern=min.pat,
            max.pattern=max.pat,
            as.cache=FALSE
          )
        )
      },
      FUN.VALUE=c(syll=0, word=""),
      USE.NAMES=TRUE
    ))
    hyph.df <- as.data.frame(hyphenate.results[as.character(words), , drop=FALSE], stringsAsFactors=FALSE)
    colnames(hyph.df) <- c("syll","word")
    rownames(hyph.df) <- NULL
    hyph.df[["syll"]] <- as.numeric(hyph.df[["syll"]])
    if(!isTRUE(quiet)){
      # close prograss bar
      close(prgBar)
    } else {}
  }

  ## compute descriptive statistics
  num.syll <- sum(hyph.df$syll, na.rm=TRUE)
  syll.distrib <- value.distribs(hyph.df$syll)
  syll.uniq.distrib <- value.distribs(unique(hyph.df)$syll)
  avg.syll.word <- mean(hyph.df$syll, na.rm=TRUE)
  syll.per100 <- avg.syll.word * 100

  desc.stat.res <- list(
    num.syll=num.syll,
    syll.distrib=syll.distrib,
    syll.uniq.distrib=syll.uniq.distrib,
    avg.syll.word=avg.syll.word,
    syll.per100=syll.per100
  )

  if(all(isTRUE(cache), isTRUE(get("changed", envir=writeBackCache)))){
    # check if cached hyphenation data has been set with set.sylly.env().
    # if so and if we added to it here, the current data will be written back to that file
    write.hyph.cache.file(file=get.sylly.env(hyph.cache.file=TRUE, errorIfUnset=FALSE), quiet=quiet)
  } else {}

  if(identical(as, "kRp.hyphen")){
    results <- kRp_hyphen(lang=lang, desc=desc.stat.res, hyphen=hyph.df)
  } else if(identical(as, "data.frame")){
    results <- hyph.df
  } else if(identical(as, "numeric")){
    results <- hyph.df[["syll"]]
  } else {
    stop(simpleError("'as' must be one of \"kRp.hyphen\", \"data.frame\", or \"numeric\"!"))
  }

  return(results)
} ## end function kRp.hyphen.calc()


## function optimize.hyph.pattern()
# replaces the pattern matrix with a hashed environment
optimize.hyph.pattern <- function(hyph.pat){
  pattern.matrix <- slot(hyph.pat, "pattern")
  pattern.list <- setNames(
    object=lapply(
      seq_along(pattern.matrix[,"char"]),
      function(n){
        return(
          list(
            # nums must remain character to keep leading zeroes!
            nums=as.character(pattern.matrix[n,"nums"]),
            orig=as.character(pattern.matrix[n,"orig"])
          )
        )
      }
    ),
    nm=pattern.matrix[,"char"]
  )
  # "kRp.hyph.pat.env" is an internal class, defined in 01_class_01_kRp.hyph.pat.R
  new.hyph.pat <- kRp_hyph_pat_env(
    lang=slot(hyph.pat, "lang"),
    min.pat=min(nchar(pattern.matrix[,"char"])),
    max.pat=max(nchar(pattern.matrix[,"char"])),
    pattern=as.environment(pattern.list)
  )
  return(new.hyph.pat)
} ## end function optimize.hyph.pattern()


## function load.hyph.pattern()
load.hyph.pattern <- function(lang){
  # to avoid needless NOTEs from R CMD check
  hyph.pat <- NULL

  lang <- is.supported.lang(lang, support="hyphen")
  # check for additional package information, in case we're
  # importing hyphen patterns from a third party package
  lang.names <- names(lang) %in% "package"
  if(length(lang) > 1 & any(lang.names)){
    hyph.package <- lang[["package"]]
    lang <- lang[!lang.names][[1]]
  } else {
    hyph.package <- "sylly"
  }
  # we'll populate the internal environment with optimized patterns
  if(!exists(paste0("hyph.", lang), envir=as.environment(.sylly.env), inherits=FALSE)){
    # we'll load the hyphen pattern, get it here and check its format
    # this way packages can carry both old and new pattern formats
    data(list=paste0("hyph.", lang), package=hyph.package, envir=as.environment(.sylly.env))
    hyph.pat.optim <- get(paste0("hyph.", lang), envir=as.environment(.sylly.env))
    if(!inherits(hyph.pat.optim, "kRp.hyph.pat.env")){
      # optimization is only needed for packages with old pattern objects
      # this should not be an issue, as it's quite fast and happens only once a pattern set is loaded
      hyph.pat.optim <- optimize.hyph.pattern(hyph.pat.optim)
      # replace hyph.XX with optimized object
      assign(paste0("hyph.", lang), hyph.pat.optim, envir=as.environment(.sylly.env))
    } else {}
  } else {
    hyph.pat.optim <- get(paste0("hyph.", lang), envir=as.environment(.sylly.env))
  }
  # return optimized patterns
  return(hyph.pat.optim)
} ## end function load.hyph.pattern()


## function is.supported.lang()
# determins if a language is supported, and returns the correct identifier
is.supported.lang <- function(lang.ident, support="hyphen"){
  if(identical(support, "hyphen")){
    hyphen.supported <- as.list(as.environment(.sylly.env))[["langSup"]][["hyphen"]][["supported"]]
    if(any(names(hyphen.supported) == lang.ident)){
      res.ident <- hyphen.supported[[lang.ident]]
    } else {
      stop(simpleError(paste0("Unknown language: \"", lang.ident, "\".\nPlease provide a valid hyphenation pattern!")))
    }
  } else {}

  return(res.ident)
} ## end function is.supported.lang()


## function value.distribs()
value.distribs <- function(value.vect, omit.missings=TRUE){
  vector.length <- length(unlist(value.vect))
  vector.summary <- summary(as.factor(value.vect))

  # try to fill up missing values with 0, e.g., found no words with 5 characters
  if(!isTRUE(omit.missings)){
    if(!is.numeric(value.vect)){
      # makes only sense for numeric values
      stop(simpleError("value.distribs(): Impute missings is only valid for numeric vectors!"))
    } else {}
    present.values <- as.numeric(names(vector.summary))
    max.value <- max(present.values)
    missing.values <- c(1:max.value)[!c(1:max.value) %in% present.values]
    append.to.summary <- rep(0, length(missing.values))
    names(append.to.summary) <- as.character(missing.values)
    vector.summary <- c(vector.summary, append.to.summary)
    # finally sort the outcome
    vector.summary <- vector.summary[order(as.numeric(names(vector.summary)))]
  } else {}

  # add cumulative values
  vect.summ.cum <- cumsum(vector.summary)
  # inverse 
  vect.summ.inv <- rbind(vector.summary, vect.summ.cum, vector.length - vect.summ.cum)
  # add percentages
  vect.summ.cum.pct <- rbind(vect.summ.inv, (vect.summ.inv * 100 / vector.length))
  dimnames(vect.summ.cum.pct)[[1]] <- c("num", "cum.sum", "cum.inv", "pct", "cum.pct", "pct.inv")
  return(vect.summ.cum.pct)
} ## end function value.distribs()


## function check.file()
# helper function for file checks
check.file <- function(filename, mode="exist", stopOnFail=TRUE){

  ret.value <- FALSE

  if(identical(mode, "exist") | identical(mode, "exec")){
    if(as.logical(file_test("-f", filename))){
      ret.value <- TRUE
    } else {
      if(isTRUE(stopOnFail)){
        stop(simpleError(paste("Specified file cannot be found:\n", filename)))
      } else {}
      ret.value <- FALSE
    }
  } else {}

  if(identical(mode, "exec")){
    if(as.logical(file_test("-x", filename))){
      ret.value <- TRUE
    } else {
      if(isTRUE(stopOnFail)){
        stop(simpleError(paste("Specified file cannot be executed:\n", filename)))
      } else {}
      ret.value <- FALSE
    }
  } else {}

  if(identical(mode, "dir")){
    if(as.logical(file_test("-d", filename))){
      ret.value <- TRUE
    } else {
      if(isTRUE(stopOnFail)){
        stop(simpleError(paste("Specified directory cannot be found:\n", filename)))
      } else {}
      ret.value <- FALSE
    }
  } else {}

  return(ret.value)
} ## end function check.file()


## function check_lang_packages()
# checks what sylly.** packages are currently installed or loaded
# returns a named list with a list for each installed package, providing
# entries named "available", "installed", "loaded", and "title"
# availabe: also check for all available packages in 'repos'
# available.only: omit all installed packages which cannot be found in 'repos'
#' @importFrom utils available.packages packageDescription
check_lang_packages <- function(available=FALSE, repos="https://undocumeantit.github.io/repos/l10n/", available.only=FALSE, pattern="^sylly\\.[[:alpha:]]+$"){
  ### this function should be kept close to identical to the respective function
  ### in the 'sylly' package, except for the pattern
  result <- list()
  if(isTRUE(available)){
    available_packages <- utils::available.packages(repos=repos)
    available_koRpus_lang <- grepl(pattern, available_packages[,"Package"])
    supported_lang <- unique(available_packages[available_koRpus_lang,"Package"])
  } else {
    available_koRpus_lang <- FALSE
    supported_lang <- NULL
  }

  loaded_packages <- loadedNamespaces()
  loaded_koRpus_lang <- grepl(pattern, loaded_packages)
  installed_packages <- unique(dir(.libPaths()))
  installed_koRpus_lang <- grepl(pattern, installed_packages)

  have_koRpus_lang <- any(installed_koRpus_lang, available_koRpus_lang)

  if(isTRUE(have_koRpus_lang)){
    if(isTRUE(available.only)){
      all_packages <- supported_lang
    } else {
      all_packages <- unique(c(installed_packages[installed_koRpus_lang], supported_lang))
    }
    for (this_package in all_packages){
      result[[this_package]] <- list(available=NA, installed=FALSE, loaded=FALSE, title="(unknown)")
      if(all(isTRUE(available), any(supported_lang == this_package))){
        result[[this_package]][["available"]] <- TRUE
      } else {}
      if(any(installed_packages[installed_koRpus_lang] == this_package)){
        result[[this_package]][["installed"]] <- TRUE
        this_package_index <- which.min(!installed_packages %in% this_package)
        result[[this_package]][["title"]] <- utils::packageDescription(installed_packages[this_package_index])[["Title"]]
      } else {}
      if(any(loaded_packages[loaded_koRpus_lang] == this_package)){
        result[[this_package]][["loaded"]] <- TRUE
      } else {}
    }
  } else {}
  
  return(result)
} ## end function check_lang_packages()
unDocUMeantIt/sylly documentation built on Sept. 24, 2020, 11:30 a.m.