R/read.corp.LCC.R

Defines functions read.corp.LCC

Documented in read.corp.LCC

# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus.  If not, see <http://www.gnu.org/licenses/>.


#' Import LCC data
#'
#' Read data from LCC[1] formatted corpora (Quasthoff, Richter & Biemann, 2006).
#'
#' The LCC database can either be unpacked or still a .tar/.tar.gz/.zip archive. If the latter is the case, then
#' all necessary files will be extracted to a temporal location automatically, and by default removed again
#' when the function has finished reading from it.
#' 
#' Newer LCC archives no longer feature the \code{*-meta.txt} file, resulting in less meta informtion in the object.
#' In these cases, the total number of tokens is calculated as the sum of types' frequencies.
#'
#' @note Please note that MySQL support is not implemented yet.
#'
#' @param LCC.path A character string, either path to a .tar/.tar.gz/.zip file in LCC format (flatfile),
#'    or the path to the directory with the unpacked archive.
#' @param format Either "flatfile" or "MySQL", depending on the type of LCC data.
#' @param fileEncoding A character string naming the encoding of the LCC files. Old zip archives used "ISO_8859-1".
#'    This option will only influence the reading of meta information, as the actual database encoding is derived from
#'    there.
#' @param n An integer value defining how many lines of data should be read if \code{format="flatfile"}. Reads all at -1.
#' @param keep.temp Logical. If \code{LCC.path} is a tarred/zipped archive, setting \code{keep.temp=TRUE} will keep
#'    the temporarily unpacked files for further use. By default all temporary files will be removed when
#'    the function ends.
#' @param prefix Character string, giving the prefix for the file names in the archive. Needed for newer LCC tar archives
#'    if they are already decompressed (autodetected if \code{LCC.path} points to the tar archive directly).
#' @param bigrams Logical, whether infomration on bigrams should be imported.
#'    This is \code{FALSE} by default, because it might make the objects quite large.
#'    Note that this will only work in \code{n = -1} because otherwise the tokens cannot be looked up.
#' @param cooccurence Logical, like \code{bigrams}, but for information on co-occurences of tokens in a sentence.
#' @param caseSens Logical, if \code{FALSE} forces all frequency statistics to be calculated regardless of the tokens' case.
#'    Otherwise, if the imported database supports it, you will get different frequencies for the same tokens in different
#'    cases (e.\,g., "one" and "One").
#' @return An object of class \code{\link[koRpus:kRp.corp.freq-class]{kRp.corp.freq}}.
# @author m.eik michalke \email{meik.michalke@@hhu.de}
#' @keywords corpora
#' @seealso \code{\link[koRpus:kRp.corp.freq-class]{kRp.corp.freq}}
#' @references Quasthoff, U., Richter, M. & Biemann, C. (2006). Corpus Portal for Search in Monolingual Corpora, In
#'    \emph{Proceedings of the Fifth International Conference on Language Resources and Evaluation}, Genoa, 1799--1802.
#'
#'    [1] \url{https://wortschatz.uni-leipzig.de/en/download/}
#' @export
#' @examples
#' \dontrun{
#' # old format .zip archive
#' my.LCC.data <- read.corp.LCC(
#'   file.path("~","mydata","corpora","de05_3M.zip")
#' )
#' # new format tar archive
#' my.LCC.data <- read.corp.LCC(
#'   file.path("~","mydata","corpora","rus_web_2002_300K-text.tar")
#' )
#' # in case the tar archive was already unpacked
#' my.LCC.data <- read.corp.LCC(
#'   file.path("~","mydata","corpora","rus_web_2002_300K-text"),
#'   prefix="rus_web_2002_300K-"
#' )
#' freq.analysis(
#'   tokenized.obj,
#'   corp.freq=my.LCC.data
#' )
#' }

read.corp.LCC <- function(LCC.path, format="flatfile", fileEncoding="UTF-8", n=-1, keep.temp=FALSE, prefix=NULL, bigrams=FALSE, cooccurence=FALSE, caseSens=TRUE){

  # basic checks before we even proceed...
  if(identical(format, "flatfile")){
    # check if LCC.path is a zip file
    if(!as.logical(file_test("-d", LCC.path))){
      if(file.exists(LCC.path)){
        if(any(bigrams, cooccurence)){
          lookForFiles <- "(words|meta|co_n|co_s).txt$"
        } else {
          lookForFiles <- "(words|meta).txt$"
        }
        # prepare temporary location to later alter variable to use that location
        tmp.path <- tempfile("koRpus.LCC")
        if(!dir.create(tmp.path, recursive=TRUE)) stop(simpleError("Can't create temporary directory!"))
        if(isTRUE(keep.temp)){
          message(paste("Unpacked data will be kept in\n", tmp.path))
        } else {
          # if the function is done, remove the tempdir
          on.exit(unlink(tmp.path, recursive=TRUE))
        }
        # old zip format
        if(grepl("(\\.zip|\\.ZIP)$", LCC.path)){
          # seems to be an existing zip file
          message("Unzipping LCC archive... ", appendLF=FALSE)
          # to save time and diskspace, unpack only the needed files
          LCC.zip.content <- as.character(unzip(LCC.path, list=TRUE)$Name)
          LCC.zip.wanted <- LCC.zip.content[grep(lookForFiles, LCC.zip.content)]
          unzip(LCC.path, files=LCC.zip.wanted, junkpaths=TRUE, exdir=tmp.path)
          message("done.")
        } else if(grepl("(\\.tar|\\.tar.gz|\\.tgz)$", tolower(LCC.path))){
          # seems to be an existing tar file
          # compression should be detected automatically
          message("Fetching needed files from LCC archive... ", appendLF=FALSE)
          # to save time and diskspace, unpack only the needed files
          LCC.tar.content <- as.character(untar(LCC.path, list=TRUE))
          LCC.tar.wanted <- LCC.tar.content[grep(lookForFiles, LCC.tar.content)]
          if(is.null(prefix)){
            prefix <- gsub(lookForFiles, "", LCC.tar.wanted)[1]
          } else {}
          untar(LCC.path, files=LCC.tar.wanted, exdir=tmp.path)
          message("done.")
        } else {
          stop(simpleError(paste("Unknown LCC data format:", LCC.path)))
        }
        # the database should now be in tmp.path
        # alter variable to use the temporary location
        LCC.path <- tmp.path
      } else {
        stop(simpleError(paste("Cannot access LCC data:", LCC.path)))
      }
    } else {
      # valid path?
      check.file(LCC.path, mode="dir")
    }

    have <- c()
    LCC.files <- c()
    for (thisFile in c("meta", "co_n", "co_s")){
      # does thisFile.txt exist?
      LCC.files[thisFile] <- file.path(LCC.path, paste0(prefix, thisFile, ".txt"))
      have[thisFile] <- check.file(LCC.files[thisFile], mode="exist", stopOnFail=FALSE)
    }
    # does words.txt exist?
    LCC.files["words"] <- file.path(LCC.path, paste0(prefix, "words.txt"))
    check.file(LCC.files["words"], mode="exist")
  } else if(identical(format, "MySQL")){
    stop(simpleMessage("Sorry, not implemented yet..."))
  } else {
    stop(simpleError(paste("Unknown format:", format)))
  }

  ## here we go!
  # TODO: need another if(identical(format, "flatfile")) here when MySQL is implemented?
  dscrpt.meta <- table.meta <- NULL
  num.running.words <- NA
  if(isTRUE(have[["meta"]])){
    table.meta <- read.delim(LCC.files["meta"], header=FALSE, col.names=c(1,"meta","value"),
      strip.white=TRUE, fill=FALSE, stringsAsFactors=FALSE, fileEncoding=fileEncoding)[,-1]
    # check the format type
    if(all(c("number of distinct word forms", "average sentence length in characters") %in% table.meta[,1])){
      LCC.archive.format <- "zip"
    } else if(all(c("SENTENCES", "WORD_TOKENS", "WORD_TYPES") %in% table.meta[,1])){
      LCC.archive.format <- "tar"
    } else {
      stop(simpleError("Sorry, this format is not supported! Please contact the authors."))
    }
    if(identical(LCC.archive.format, "zip")){
      num.distinct.words  <- as.numeric(table.meta[table.meta[,1] == "number of distinct word forms", 2])
      num.running.words   <- as.numeric(table.meta[table.meta[,1] == "number of running word forms", 2])
      avg.sntclgth.words  <- as.numeric(table.meta[table.meta[,1] == "average sentence length in words", 2])
      avg.sntclgth.chars  <- as.numeric(table.meta[table.meta[,1] == "average sentence length in characters", 2])
      avg.wrdlgth.form    <- as.numeric(table.meta[table.meta[,1] == "average word form length", 2])
      avg.wrdlgth.running <- as.numeric(table.meta[table.meta[,1] == "average running word length", 2])
      fileEncoding        <- table.meta[table.meta[,1] == "database encoding", 2]
    } else if(identical(LCC.archive.format, "tar")) {
      num.distinct.words  <- as.numeric(table.meta[table.meta[,1] == "WORD_TYPES", 2])
      num.running.words   <- as.numeric(table.meta[table.meta[,1] == "WORD_TOKENS", 2])
      avg.sntclgth.words <- avg.sntclgth.chars  <- avg.wrdlgth.form <- avg.wrdlgth.running <- NA
      fileEncoding        <- table.meta[table.meta[,1] == "database encoding", 2]
    }

    dscrpt.meta <- data.frame(
      tokens=num.running.words,
      types=num.distinct.words,
      words.p.sntc=avg.sntclgth.words,
      chars.p.sntc=avg.sntclgth.chars,
      chars.p.wform=avg.wrdlgth.form,
      chars.p.word=avg.wrdlgth.running)
  } else {}

  # checking n-grams
  table.cooccur <- table.bigrams <- NULL
  if(any(bigrams, cooccurence) & !identical(n, -1)){
    warning("Importing bigrams and co-occurrences is only possible while 'n = -1'!")
  } else {}
  if(isTRUE(bigrams) & identical(n, -1)){
    # bigrams
    if(isTRUE(have[["co_n"]])){
      message("Importing bigrams... ", appendLF=FALSE)
      LCC.file.con <- file(LCC.files[["co_n"]], open="r")
      rL.words <- readLines(LCC.file.con, encoding=fileEncoding)
      close(LCC.file.con)
      table.bigrams <- matrix(unlist(strsplit(rL.words, "\t")), ncol=4, byrow=TRUE, dimnames=list(c(),c("token1","token2","freq","sig")))
      rm(rL.words)
      message("done.")
    } else {
      warning("'bigrams' is TRUE, but no *-co_n.txt file was found in the LCC archive!")
    }
  } else {}
  if(isTRUE(cooccurence) & identical(n, -1)){
    # co-occrrence in one sentence
    if(isTRUE(have[["co_s"]])){
      message("Importing co-occrrence in one sentence... ", appendLF=FALSE)
      LCC.file.con <- file(LCC.files[["co_s"]], open="r")
      rL.words <- readLines(LCC.file.con, encoding=fileEncoding)
      close(LCC.file.con)
      table.cooccur <- matrix(unlist(strsplit(rL.words, "\t")), ncol=4, byrow=TRUE, dimnames=list(c(),c("token1","token2","freq","sig")))
      rm(rL.words)
      message("done.")
    } else {
      warning("'cooccurence' is TRUE, but no *-co_s.txt file was found in the LCC archive!")
    }
  } else {}
  # LCC files can be veeeery large. if so, reading them will most likely freeze R
  # as a precaution we'll therefore use a file connection and readLines()
  LCC.file.con <- file(LCC.files[["words"]], open="r")
  rL.words <- readLines(LCC.file.con, n=n, encoding=fileEncoding)
  close(LCC.file.con)
  # newer archives have four instead of three columns, check for this
  words.num.cols <- length(unlist(strsplit(rL.words[1], "\t")))
  if(isTRUE(words.num.cols == 3)){
    table.words <- matrix(unlist(strsplit(rL.words, "\t")), ncol=3, byrow=TRUE, dimnames=list(c(),c("num","word","freq")))
  } else if(isTRUE(words.num.cols == 4)){
    table.words <- matrix(unlist(strsplit(rL.words, "\t")), ncol=4, byrow=TRUE, dimnames=list(c(),c("num","word","word2","freq")))
    rm(rL.words)
    # usually, the third row is a duplicate of the second, then it's safe to drop one
    if(!identical(table.words[,2], table.words[,3])){
      warning(
        paste0(
          "This looks like a newer LCC archive with four columns in the *-words.txt file.\n",
          "The two word columns did not match, but we'll only use the first one!"
        )
      )
    } else {}
    table.words <- table.words[,c("num","word","freq")]
  } else {
    stop(simpleError(
      paste0("It seems the LCC archove format has changed:\n",
        "  koRpus supports *-words.txt files with 3 or 4 columns, found ", words.num.cols, ".\n",
        "  Please inform the package author(s) so that the problem get's fixed soon!"
      )
    ))
  }

  # call internal function create.corp.freq.object()
  results <- create.corp.freq.object(
              matrix.freq=table.words,
              num.running.words=num.running.words,
              df.meta=table.meta,
              df.dscrpt.meta=dscrpt.meta,
              matrix.table.bigrams=table.bigrams,
              matrix.table.cooccur=table.cooccur,
              caseSens=caseSens
            )

  return(results)
}
unDocUMeantIt/koRpus documentation built on May 21, 2021, 9:26 p.m.