R/lda.R

Defines functions germaparl_load_lda germaparl_download_lda

Documented in germaparl_download_lda germaparl_load_lda

#' @include download.R GermaParl.R
NULL

#' Use topicmodels prepared for GermaParl.
#' 
#' A set of LDA topicmodels is part of the Zenodo release of GermaParl (k
#' between 100 and 450). These topic models can be downloaded using
#' \code{germaparl_download_lda} and loaded using \code{germaparl_load_lda}.
#' 
#' @details The function \code{germaparl_download_lda} will download an
#'   \code{rds}-file that will be stored in the data directory of the GermaParl
#'   corpus.
#' @param k A \code{numeric} or \code{integer} vector, the number of topics of
#'   the topicmodel. Multiple values can be provided to download several topic
#'   models at once.
#' @param doi The DOI of GermaParl at Zenodo.
#' @param registry_dir The registry directory where the registry file for GERMAPARL
#'   is located.
#' @param data_dir The data directory with the binary files of the GERMAPARL
#'   corpus. If missing, the directory will be guessed using the function
#'   \code{cwb::cwb_corpus_dir}
#' @param sample A \code{logical} value, if \code{TRUE}, use GERMAPARLSAMPLE
#'   corpus rather than GERMAPARL.
#' @param verbose A \code{logical} value, whether to show status messages.
#' @return The function \code{germaparl_download_lda} will (invisibly) return
#'   \code{TRUE} if the operation has been succesful and \code{FALSE} if not.
#' @export germaparl_download_lda
#' @importFrom zen4R ZenodoManager
#' @aliases topics
#' @rdname germaparl_topics
#' @importFrom utils download.file
#' @importFrom tools md5sum
#' @importFrom cli cat_rule  cli_process_start cli_process_done cli_alert_info
#'   cli_process_failed cli_alert_danger
#' @examples
#' # This example assumes that the directories used by the CWB do not yet exist, so
#' # temporary directories are created.
#' cwb_dirs <- cwbtools::create_cwb_directories(prefix = tempdir(), ask = FALSE)
#' 
#' samplemode <- TRUE
#' corpus_id <- "GERMAPARLSAMPLE" # for full corpus: corpus_id <- "GERMAPARL"
#' 
#' dir.create(file.path(cwb_dirs[["corpus_dir"]], tolower(corpus_id)))
#' 
#' # Download topic model
#' germaparl_download_lda(
#'   k = 30, # k = 250 recommended for full GERMAPARL corpus
#'   data_dir = file.path(cwb_dirs[["corpus_dir"]], tolower(corpus_id)),
#'   sample = samplemode
#' )
#' lda <- germaparl_load_lda(
#'   k = 30L, registry_dir = cwb_dirs[["registry_dir"]],
#'   sample = samplemode
#' )
#' lda_terms <- topicmodels::terms(lda, 10)
germaparl_download_lda <- function(
  k = c(100L, 150L, 175L, 200L, 225L, 250L, 275L, 300L, 350L, 400L, 450L),
  doi = "10.5281/zenodo.3742113",
  data_dir,
  sample = FALSE,
  verbose = TRUE
  ){
  
  if (isTRUE(sample)) doi <- "10.5281/zenodo.3823245"
  corpus_id <- if (isFALSE(sample)) "GERMAPARL" else "GERMAPARLSAMPLE"
  if (missing(data_dir)) data_dir <- file.path(cwb_corpus_dir(), tolower(corpus_id))
  if (!is.numeric(k)) stop("Argument k is required to be a numeric vector.")
  if (length(k) > 1L){
    sapply(1L:length(k), function(i) germaparl_download_lda(k = k[i], doi = doi, verbose = verbose))
  } else {
    if (verbose) cli_process_start(sprintf("get Zenodo record for doi %s", doi) )
    tryCatch(
      zenodo_record <- ZenodoManager$new()$getRecordByDOI(doi = doi),
      error = function(e) if (verbose) cli_process_failed() else cli_alert_danger(sprintf("Zenodo record not available"))
    )
    if (!exists("zenodo_record")){
      return(invisible(FALSE))
    } else if (is.null(zenodo_record)){
      cli_alert_danger(sprintf("no Zenodo record found for DOI %s", doi))
      return(invisible(FALSE))
    } else {
      if (verbose) cli_process_done()
    }
    
    zenodo_files <- sapply(zenodo_record[["files"]], function(x) basename(x[["links"]][["download"]]))
    filename_regex <- sprintf("^%s_lda_.*?%d\\.rds$", tolower(corpus_id), k)
    file_matching <- grep(filename_regex, zenodo_files)
    if (length(file_matching) == 0L){
      cli_alert_danger(sprintf("No file available at Zenodo repository matching regex '%s'.", filename_regex))
      return(FALSE)
    } else if (length(file_matching) > 1L){
      cli_alert_danger("FAIL - more than one potential file candiddate for download!")
      return(FALSE)
    } else if (length(file_matching) == 1L){
      lda_remote <- zenodo_record[["files"]][[file_matching]][["links"]][["download"]]
      if (verbose) cli_alert_info("starting to download LDA model")
      lda_local <- file.path(data_dir, basename(lda_remote))
      tryCatch(
        download_success <- download.file(url = lda_remote, destfile = lda_local),
        error = function(e) cli_alert_danger(sprintf("cannot download file from %s", lda_remote))
      )
      if (!exists("download_success")) return(invisible(FALSE))
      if (download_success != 0){
        cli_alert_danger(sprintf("downloading file from %s has failed", lda_remote))
        return(invisible(FALSE))
      }
      if (verbose) cli_process_start(sprintf("check md5 checksum for downloaded file %s", basename(lda_local)))
      lda_file_local_md5 <- tools::md5sum(lda_local)
      if (lda_file_local_md5 == zenodo_record[["files"]][[file_matching]][["checksum"]]){
        if (verbose) cli_process_done()
        return(invisible(TRUE))
      } else {
        if (verbose) cli_process_failed()
        cli_alert_danger(
          sprintf(
            "md5sum (%s) of file '%s' does not match Zenodo archive md5sum (%s)",
            lda_file_local_md5, basename(lda_local)
          )
        )
        return(FALSE)
      }
    } 
  }
  invisible(TRUE)
}


#' @details \code{germaparl_load_lda} will load a topicmodel into memory.
#'   The function will return a \code{LDA_Gibbs} topicmodel, if the topicmodel
#'   for \code{k} is present; \code{NULL} if the topicmodel has not yet been
#'   downloaded.
#' @param verbose logical
#' @export germaparl_load_lda
#' @importFrom cwbtools registry_file_parse cwb_registry_dir
#' @rdname germaparl_topics
germaparl_load_lda <- function(k, registry_dir = cwbtools::cwb_registry_dir(), verbose = TRUE, sample = FALSE){
  corpus_id <- if (isFALSE(sample)) "GERMAPARL" else "GERMAPARLSAMPLE" 
  if (verbose) message(sprintf("... loading topicmodel for k = %d", k))
  if (file.exists(file.path(registry_dir, tolower(corpus_id)))){
    topicmodel_dir <- registry_file_parse(corpus = tolower(corpus_id), registry_dir = registry_dir)[["home"]]
  } else {
    topicmodel_dir <- file.path(dirname(registry_dir), "indexed_corpora", tolower(corpus_id))
    if (!dir.exists(topicmodel_dir)){
      stop("Cannot guess directory for topicmodels.")
    }
  }
  lda_files <- Sys.glob(paths = sprintf("%s/%s_lda_*.rds", topicmodel_dir, tolower(corpus_id)))
  ks <- as.integer(gsub(sprintf("%s_lda_.*?(\\d+)\\.rds$", tolower(corpus_id)), "\\1", basename(lda_files)))
  if (!k %in% ks){
    warning("no topicmodel available for k provided")
    return(NULL)
  }
  names(lda_files) <- ks
  readRDS(lda_files[[as.character(k)]])
}

Try the GermaParl package in your browser

Any scripts or data that you put into this service are public.

GermaParl documentation built on Oct. 23, 2020, 8:27 p.m.