R/lda.R
In GermaParl: Download and Augment the Corpus of Plenary Protocols of the German Bundestag

Documented in germaparl_download_lda germaparl_load_lda

#' @include download.R GermaParl.R
NULL

#' Use topicmodels prepared for GermaParl.
#' 
#' A set of LDA topicmodels is part of the Zenodo release of GermaParl (k
#' between 100 and 450). These topic models can be downloaded using
#' \code{germaparl_download_lda} and loaded using \code{germaparl_load_lda}.
#' 
#' @details The function \code{germaparl_download_lda} will download an
#'   \code{rds}-file that will be stored in the data directory of the GermaParl
#'   corpus.
#' @param k A \code{numeric} or \code{integer} vector, the number of topics of
#'   the topicmodel. Multiple values can be provided to download several topic
#'   models at once.
#' @param doi The DOI of GermaParl at Zenodo.
#' @param registry_dir The registry directory where the registry file for GERMAPARL
#'   is located.
#' @param data_dir The data directory with the binary files of the GERMAPARL
#'   corpus. If missing, the directory will be guessed using the function
#'   \code{cwb::cwb_corpus_dir}
#' @param sample A \code{logical} value, if \code{TRUE}, use GERMAPARLSAMPLE
#'   corpus rather than GERMAPARL.
#' @param verbose A \code{logical} value, whether to show status messages.
#' @return The function \code{germaparl_download_lda} will (invisibly) return
#'   \code{TRUE} if the operation has been succesful and \code{FALSE} if not.
#' @export germaparl_download_lda
#' @importFrom zen4R ZenodoManager
#' @aliases topics
#' @rdname germaparl_topics
#' @importFrom utils download.file
#' @importFrom tools md5sum
#' @importFrom cli cat_rule  cli_process_start cli_process_done cli_alert_info
#'   cli_process_failed cli_alert_danger
#' @examples
#' # This example assumes that the directories used by the CWB do not yet exist, so
#' # temporary directories are created.
#' cwb_dirs <- cwbtools::create_cwb_directories(prefix = tempdir(), ask = FALSE)
#' 
#' samplemode <- TRUE
#' corpus_id <- "GERMAPARLSAMPLE" # for full corpus: corpus_id <- "GERMAPARL"
#' 
#' dir.create(file.path(cwb_dirs[["corpus_dir"]], tolower(corpus_id)))
#' 
#' # Download topic model
#' germaparl_download_lda(
#'   k = 30, # k = 250 recommended for full GERMAPARL corpus
#'   data_dir = file.path(cwb_dirs[["corpus_dir"]], tolower(corpus_id)),
#'   sample = samplemode
#' )
#' lda <- germaparl_load_lda(
#'   k = 30L, registry_dir = cwb_dirs[["registry_dir"]],
#'   sample = samplemode
#' )
#' lda_terms <- topicmodels::terms(lda, 10)
germaparl_download_lda <- function(
  k = c(100L, 150L, 175L, 200L, 225L, 250L, 275L, 300L, 350L, 400L, 450L),
  doi = "10.5281/zenodo.3742113",
  data_dir,
  sample = FALSE,
  verbose = TRUE
  ){
  
  if (isTRUE(sample)) doi <- "10.5281/zenodo.3823245"
  corpus_id <- if (isFALSE(sample)) "GERMAPARL" else "GERMAPARLSAMPLE"
  if (missing(data_dir)) data_dir <- file.path(cwb_corpus_dir(), tolower(corpus_id))
  if (!is.numeric(k)) stop("Argument k is required to be a numeric vector.")
  if (length(k) > 1L){
    sapply(1L:length(k), function(i) germaparl_download_lda(k = k[i], doi = doi, verbose = verbose))
  } else {
    if (verbose) cli_process_start(sprintf("get Zenodo record for doi %s", doi) )
    tryCatch(
      zenodo_record <- ZenodoManager$new()$getRecordByDOI(doi = doi),
      error = function(e) if (verbose) cli_process_failed() else cli_alert_danger(sprintf("Zenodo record not available"))
    )
    if (!exists("zenodo_record")){
      return(invisible(FALSE))
    } else if (is.null(zenodo_record)){
      cli_alert_danger(sprintf("no Zenodo record found for DOI %s", doi))
      return(invisible(FALSE))
    } else {
      if (verbose) cli_process_done()
    }
    
    zenodo_files <- sapply(zenodo_record[["files"]], function(x) basename(x[["links"]][["download"]]))
    filename_regex <- sprintf("^%s_lda_.*?%d\\.rds$", tolower(corpus_id), k)
    file_matching <- grep(filename_regex, zenodo_files)
    if (length(file_matching) == 0L){
      cli_alert_danger(sprintf("No file available at Zenodo repository matching regex '%s'.", filename_regex))
      return(FALSE)
    } else if (length(file_matching) > 1L){
      cli_alert_danger("FAIL - more than one potential file candiddate for download!")
      return(FALSE)
    } else if (length(file_matching) == 1L){
      lda_remote <- zenodo_record[["files"]][[file_matching]][["links"]][["download"]]
      if (verbose) cli_alert_info("starting to download LDA model")
      lda_local <- file.path(data_dir, basename(lda_remote))
      tryCatch(
        download_success <- download.file(url = lda_remote, destfile = lda_local),
        error = function(e) cli_alert_danger(sprintf("cannot download file from %s", lda_remote))
      )
      if (!exists("download_success")) return(invisible(FALSE))
      if (download_success != 0){
        cli_alert_danger(sprintf("downloading file from %s has failed", lda_remote))
        return(invisible(FALSE))
      }
      if (verbose) cli_process_start(sprintf("check md5 checksum for downloaded file %s", basename(lda_local)))
      lda_file_local_md5 <- tools::md5sum(lda_local)
      if (lda_file_local_md5 == zenodo_record[["files"]][[file_matching]][["checksum"]]){
        if (verbose) cli_process_done()
        return(invisible(TRUE))
      } else {
        if (verbose) cli_process_failed()
        cli_alert_danger(
          sprintf(
            "md5sum (%s) of file '%s' does not match Zenodo archive md5sum (%s)",
            lda_file_local_md5, basename(lda_local)
          )
        )
        return(FALSE)
      }
    } 
  }
  invisible(TRUE)
}


#' @details \code{germaparl_load_lda} will load a topicmodel into memory.
#'   The function will return a \code{LDA_Gibbs} topicmodel, if the topicmodel
#'   for \code{k} is present; \code{NULL} if the topicmodel has not yet been
#'   downloaded.
#' @param verbose logical
#' @export germaparl_load_lda
#' @importFrom cwbtools registry_file_parse cwb_registry_dir
#' @rdname germaparl_topics
germaparl_load_lda <- function(k, registry_dir = cwbtools::cwb_registry_dir(), verbose = TRUE, sample = FALSE){
  corpus_id <- if (isFALSE(sample)) "GERMAPARL" else "GERMAPARLSAMPLE" 
  if (verbose) message(sprintf("... loading topicmodel for k = %d", k))
  if (file.exists(file.path(registry_dir, tolower(corpus_id)))){
    topicmodel_dir <- registry_file_parse(corpus = tolower(corpus_id), registry_dir = registry_dir)[["home"]]
  } else {
    topicmodel_dir <- file.path(dirname(registry_dir), "indexed_corpora", tolower(corpus_id))
    if (!dir.exists(topicmodel_dir)){
      stop("Cannot guess directory for topicmodels.")
    }
  }
  lda_files <- Sys.glob(paths = sprintf("%s/%s_lda_*.rds", topicmodel_dir, tolower(corpus_id)))
  ks <- as.integer(gsub(sprintf("%s_lda_.*?(\\d+)\\.rds$", tolower(corpus_id)), "\\1", basename(lda_files)))
  if (!k %in% ks){
    warning("no topicmodel available for k provided")
    return(NULL)
  }
  names(lda_files) <- ks
  readRDS(lda_files[[as.character(k)]])
}

Any scripts or data that you put into this service are public.

GermaParl documentation built on Oct. 23, 2020, 8:27 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GermaParl
Download and Augment the Corpus of Plenary Protocols of the German Bundestag

R/lda.R
In GermaParl: Download and Augment the Corpus of Plenary Protocols of the German Bundestag

Defines functions germaparl_load_lda germaparl_download_lda

Documented in germaparl_download_lda germaparl_load_lda

Try the GermaParl package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GermaParl Download and Augment the Corpus of Plenary Protocols of the German Bundestag

R/lda.R In GermaParl: Download and Augment the Corpus of Plenary Protocols of the German Bundestag

Defines functions germaparl_load_lda germaparl_download_lda

Documented in germaparl_download_lda germaparl_load_lda

Try the GermaParl package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GermaParl
Download and Augment the Corpus of Plenary Protocols of the German Bundestag

R/lda.R
In GermaParl: Download and Augment the Corpus of Plenary Protocols of the German Bundestag