R/init.r

Defines functions build_ecotox_sqlite download_ecotox_data get_ecotox_path check_ecotox_availability .get_ecotox_url get_ecotox_url

Documented in build_ecotox_sqlite check_ecotox_availability download_ecotox_data get_ecotox_path get_ecotox_url

#' Get ECOTOX download URL from EPA website
#'
#' `r lifecycle::badge('stable')` This function downloads the webpage at <https://cfpub.epa.gov/ecotox/index.cfm>. It then searches for the
#' download link for the complete ECOTOX database and extract its URL.
#'
#' This function is called by [download_ecotox_data()] which tries to download the file from the resulting
#' URL. On some machines this fails due to issues with the SSL certificate. The user can try to download the file
#' by using this URL in a different browser (or on a different machine). Alternatively, the user could try to use
#' `[download_ecotox_data](verify_ssl = FALE)` when the download URL is trusted.
#' @param verify_ssl When set to `FALSE` the SSL certificate of the host (EPA)
#' is not verified. Can also be set as option:
#' `options(ECOTOXr_verify_ssl = TRUE)`. Default is `TRUE`.
#' @param ... arguments passed on to [httr::GET()]
#' @returns Returns a `character` string containing the download URL of the latest version of the EPA ECOTOX
#' database.
#' @rdname get_ecotox_url
#' @name get_ecotox_url
#' @examples
#' \dontrun{
#' get_ecotox_url()
#' }
#' @author Pepijn de Vries
#' @export
get_ecotox_url <- function(verify_ssl = getOption("ECOTOXr_verify_ssl"), ...) {
  if (is.null(verify_ssl)) verify_ssl <- TRUE
  args <- list(...)
  if (!verify_ssl)
    args[["config"]] <- httr::config(ssl_verifyhost = 0, ssl_verifypeer = 0)
  link <- tryCatch({
    do.call(.get_ecotox_url, c(url = "https://cfpub.epa.gov/ecotox/index.cfm",
                               args))
  }, error = function(e) {
    do.call(.get_ecotox_url, c(url = "https://gaftp.epa.gov/ecotox/",
                               args))
  })
  return(link)
}

.get_ecotox_url <- function(url, ...) {
  link <- 
    httr::GET(url, ...) %>%
    rvest::read_html() %>%
    rvest::html_elements("a") %>%
    rvest::html_attr("href")
  link <- link[!is.na(link) & endsWith(link, ".zip")]
  if (length(link) == 0) stop("Could not find ASCII download link...")
  link[!startsWith(link, "http")] <- paste0(url, link[!startsWith(link, "http")])
  link_dates <-
    stringr::str_sub(link, -14, -5) %>%
    as.Date(format = "%m_%d_%Y")
  link[which(link_dates == max(link_dates))]
}

#' Check whether a ECOTOX database exists locally
#'
#' `r lifecycle::badge('stable')` Tests whether a local copy of the US EPA ECOTOX database exists in
#'  [get_ecotox_path()].
#'
#' When arguments are omitted, this function will look in the default directory ([get_ecotox_path()]).
#' However, it is possible to build a database file elsewhere if necessary.
#' @param target A `character` string specifying the path where to look for the database file.
#' @returns Returns a `logical` value indicating whether a copy of the database exists. It also returns
#' a `files` attribute that lists which copies of the database are found.
#' @rdname check_ecotox_availability
#' @name check_ecotox_availability
#' @examples
#' check_ecotox_availability()
#' @author Pepijn de Vries
#' @export
check_ecotox_availability <- function(target = get_ecotox_path()) {
  files    <- list.files(target)
  file_reg <- gregexpr("(?<=^ecotox_ascii_)(.*?)(?=\\.sqlite$)", files, perl = T)
  file_reg <- regmatches(files, file_reg)

  files    <- files[unlist(lapply(file_reg, length)) > 0]
  file_reg <- unlist(file_reg[unlist(lapply(file_reg, length)) > 0])
  if (any(nchar(file_reg) > 0)) {
    file_reg <- as.Date(file_reg, format = "%m_%d_%Y")
    files    <- files[!is.na(file_reg)]
    file_reg <- file_reg[!is.na(file_reg)]
  } else {
    file_reg <- as.Date(NA)[-1]
    target   <- character(0)
  }
  result <- length(files) > 0
  attributes(result)$files <- data.frame(path = target, database = files, date = file_reg,
                                         stringsAsFactors = F)
  return(result)
}

#' The local path to the ECOTOX database (directory or sqlite file)
#'
#' `r lifecycle::badge('stable')` Obtain the local path to where the ECOTOX database is
#' (or will be) placed.
#'
#' It can be useful to know where the database is located on your disk. This function
#' returns the location as provided by [rappdirs::app_dir()], or as
#' specified by you using `options(ECOTOXr_path = "mypath")`.
#'
#' @param path When you have a copy of the database somewhere other than the default
#' directory ([get_ecotox_path()]), you can provide the path here.
#' @param version A `character` string referring to the release version of the database you wish to locate.
#' It should have the same format as the date in the EPA download link, which is month, day, year, separated by
#' underscores ("%m_%d_%Y"). When missing, the most recent available copy is selected automatically.
#' @returns Returns a `character` string of the path.
#' `get_ecotox_path` will return the default directory of the database.
#' `get_ecotox_sqlite_file` will return the path to the sqlite file when it exists.
#' @rdname get_path
#' @name get_ecotox_path
#' @examples
#' get_ecotox_path()
#'
#' \dontrun{
#' ## This will only work if a local database exists:
#' get_ecotox_sqlite_file()
#' }
#' @author Pepijn de Vries
#' @export
get_ecotox_path <- function() {
  getOption("ECOTOXr_path", rappdirs::app_dir("ECOTOXr")$cache())
}

#' Download and extract ECOTOX database files and compose database
#'
#' `r lifecycle::badge('stable')` In order for this package to fully function, a local copy of the ECOTOX database
#'  needs to be build. This function will download the required data and build the database.
#'
#' This function will attempt to find the latest download url for the ECOTOX database from the 
#' [EPA website](https://cfpub.epa.gov/ecotox/index.cfm) (see [get_ecotox_url()]).
#' When found it will attempt to download the zipped archive containing all required data. This data is then
#' extracted and a local copy of the database is build.
#'
#' Use '[suppressMessages()]' to suppress the progress report.
#' @section Known issues:
#' On some machines this function fails to connect to the database download URL from the
#' [EPA website](https://cfpub.epa.gov/ecotox/index.cfm) due to missing
#' SSL certificates. Unfortunately, there is no easy fix for this in this package. A work around is to download and
#' unzip the file manually using a different machine or browser that is less strict with SSL certificates. You can
#' then call [build_ecotox_sqlite()] and point the `source` location to the manually extracted zip
#' archive. For this purpose [get_ecotox_url()] can be used. Alternatively, one could try to call [download_ecotox_data()]
#' by setting `verify_ssl = FALSE`; but only do so when you trust the download URL from [get_ecotox_URL()].
#'
#' @param target Target directory where the files will be downloaded and the database compiled. Default is
#' [get_ecotox_path()].
#' @param write_log A `logical` value indicating whether a log file should be written to the target path
#  after building the SQLite database. See \code{\link{build_ecotox_sqlite}()} for more details. Default is
#' `TRUE`.
#' @param ask There are several steps in which files are (potentially) overwritten or deleted. In those cases
#' the user is asked on the command line what to do in those cases. Set this parameter to `FALSE` in order
#' to continue without warning and asking.
#' @inheritParams get_ecotox_url
#' @param ... Arguments passed on to [httr::GET()].
#' @returns Returns `NULL` invisibly.
#' @rdname download_ecotox_data
#' @name download_ecotox_data
#' @examples
#' \dontrun{
#' ## This will download and build the database in your temp dir:
#' download_ecotox_data(tempdir())
#' }
#' @author Pepijn de Vries
#' @export
download_ecotox_data <- function(
    target = get_ecotox_path(), write_log = TRUE, ask = TRUE,
    verify_ssl = getOption("ECOTOXr_verify_ssl"), ...) {
  if (is.null(verify_ssl)) verify_ssl <- TRUE
  
  avail <- check_ecotox_availability()
  if (avail && ask) {
    cat(sprintf("A local database already exists (%s).", paste(attributes(avail)$file$database, collapse = ", ")))
    prompt <- readline(prompt = "Do you wish to continue and potentially overwrite the existing database? (y/n) ")
    if (!startsWith("Y", toupper(prompt))) {
      message("Download aborted...\n")
      return(invisible(NULL))
    }
  }
  if (!dir.exists(target)) dir.create(target, recursive = T)
  ## Obtain download link from EPA website:
  message(crayon::white("Obtaining download link from EPA website... "))
  link <- get_ecotox_url(verify_ssl, ...)
  dest_path <- file.path(target, utils::tail(unlist(strsplit(link, "/")), 1))
  message(crayon::green("Done\n"))
  proceed.download <- T
  if (file.exists(dest_path) && ask) {
    prompt <- readline(prompt = sprintf("ECOTOX data is already present (%s), overwrite (y/n)? ", dest_path))
    proceed.download <- startsWith("Y", toupper(prompt))
  }
  if (proceed.download) {
    message(crayon::white(sprintf("Start downloading ECOTOX data from %s...\n", link)))
    cfg <- list(
      noprogress = 0L,
      progressfunction = function(down, up) {
        message(crayon::white(sprintf("\r%0.1f MB downloaded...",
                                      down[2]/(1024*1024))), appendLF = FALSE)
        TRUE
      })
    if (!verify_ssl) {
      cfg[["ssl_verifyhost"]] <- 0
      cfg[["ssl_verifypeer"]] <- 0
      }
    cfg <- do.call(httr::config, cfg)
    
    httr::GET(link, config = cfg,
              httr::write_disk(dest_path, overwrite = TRUE), ...)

    message(crayon::green(" Done\n"))
  }

  ## create bib-file for later reference
  con <- file(gsub(".zip", "_cit.txt", dest_path), "w+")
  release <- as.Date(stringr::str_sub(link, -15, -1), format = "_%m_%d_%Y.zip")
  writeLines(format(utils::bibentry(
    "misc",
    title        = format(release, "US EPA ECOTOXicology Database System Version 5.0 release %Y-%m-%d"),
    author       = utils::person(family = "US EPA", role = "aut"),
    year         = format(release, "%Y"),
    url          = link,
    howpublished = link,
    note         = format(Sys.Date(), "Accessed: %Y-%m-%d")), "R"), con)
  close(con)
  extr.path <- gsub(".zip", "", dest_path)
  proceed.unzip <- T
  if (dir.exists(extr.path)) {
    test.files <- list.files(extr.path)
    if (length(test.files) >= 12 && any(test.files == "chemical_carriers.txt") && ask) {
      cat("Extracted zip files already appear to exist.\n")
      prompt <- readline(prompt = "Continue unzipping and overwriting these files (y/n)? ")
      proceed.unzip <- startsWith("Y", toupper(prompt))
    }
  }
  if (proceed.unzip) {
    message(crayon::white("Extracting downloaded zip file... "))
    exdir     <- gsub(".zip", "", basename(link))
    file_list <- utils::unzip(file.path(target, utils::tail(unlist(strsplit(link, "/")), 1)), list = T)$Name
    if (all(startsWith(file_list, exdir))) exdir <- ""
    utils::unzip(file.path(target, utils::tail(unlist(strsplit(link, "/")), 1)),
                 exdir = file.path(target, exdir))
    message(crayon::green("Done\n"))
    if (ask &&
        startsWith("Y", toupper(readline(prompt = "Done extracting zip file, remove it to save disk space (y/n)? ")))) {
      message(crayon::white("Trying to delete zip file... "))
      tryCatch({
        file.remove(file.path(target, utils::tail(unlist(strsplit(link, "/")), 1)))
        message(crayon::green("Done\n"))
      }, error = function(e) {
        message(crayon::red("Failed to delete the file, continuing with next step"))
      })
    }
  }
  message(crayon::white("Start constructing SQLite database from downloaded tables...\n"))
  message(crayon::white("Note that this may take some time...\n"))
  build_ecotox_sqlite(extr.path, target, write_log)
  return(invisible(NULL))
}

#' Build an SQLite database from zip archived tables downloaded from EPA website
#'
#' `r lifecycle::badge('stable')` This function is called automatically after [download_ecotox_data()]. The database
#' files can also be downloaded manually from the [EPA website](https://cfpub.epa.gov/ecotox/) from which a local
#' database can be build using this function.
#'
#' Raw data downloaded from the EPA website is in itself not very efficient to work with in R. The files are large
#' and would put a large strain on R when loading completely into the system's memory. Instead use this function
#' to build an SQLite database from the tables. That way, the data can be queried without having to load it all into
#' memory.
#'
#' EPA provides the raw table from the [ECOTOX database](https://cfpub.epa.gov/ecotox/) as text files with
#' pipe-characters ('|') as table column separators. Although not documented, the tables appear not to contain comment
#' or quotation characters. There are records containing the reserved pipe-character that will confuse the table parser.
#' For these records, the pipe-character is replaced with a dash character ('-').
#'
#' In addition, while reading the tables as text files, this package attempts to decode the text as UTF8. Unfortunately,
#' this process appears to be platform-dependent, and may therefore result in different end-results on different platforms.
#' This problem only seems to occur for characters that are listed as 'control characters' under UTF8. This will have
#' consequences for reproducibility, but only if you build search queries that look for such special characters. It is
#' therefore advised to stick to common (non-accented) alpha-numerical characters in your searches, for the sake of
#' reproducibility.
#' 
#' Use '[suppressMessages()]' to suppress the progress report.
#'
#' @param source A `character` string pointing to the directory path where the text files with the raw
#' tables are located. These can be obtained by extracting the zip archive from <https://cfpub.epa.gov/ecotox/>
#' and look for 'Download ASCII Data'.
#' @param destination A `character` string representing the destination path for the SQLite file. By default
#' this is [get_ecotox_path()].
#' @param write_log A `logical` value indicating whether a log file should be written in the destination path
#  after building the SQLite database. See \code{\link{build_ecotox_sqlite}()} for more details. Default is
#' `TRUE`. The log contains information on the source and destination path, the version of this package,
#' the creation date, and the operating system on which the database was created.
#' @returns Returns `NULL` invisibly.
#' @rdname build_ecotox_sqlite
#' @name build_ecotox_sqlite
#' @examples
#' \dontrun{
#' ## This example will only work properly if 'dir' points to an existing directory
#' ## with the raw tables from the ECOTOX database. This function will be called
#' ## automatically after a call to 'download_ecotox_data()'.
#' test <- check_ecotox_availability()
#' if (test) {
#'   files   <- attributes(test)$files[1,]
#'   dir     <- gsub(".sqlite", "", files$database, fixed = T)
#'   path    <- files$path
#'   if (dir.exists(file.path(path, dir))) {
#'     ## This will build the database in your temp directory:
#'     build_ecotox_sqlite(source = file.path(path, dir), destination = tempdir())
#'   }
#' }
#' }
#' @author Pepijn de Vries
#' @export
build_ecotox_sqlite <- function(source, destination = get_ecotox_path(), write_log = TRUE) {
  dbname <- paste0(basename(source), ".sqlite")
  dbcon  <- RSQLite::dbConnect(RSQLite::SQLite(), file.path(destination, dbname))
  unexpected_fields <- character(0)
  missing_fields    <- character(0)
  missing_tables    <- character(0)
  incomplete_check  <- character(0)

  ## Loop the text file tables and add them to the sqlite database 1 by 1
  i <- 0

  by(.db_specs, .db_specs$table, function(tab) {
    i <<- i + 1
    message(crayon::white(sprintf("Adding '%s' table (%i/%i) to database:\n",
                                  tab$table[[1]], i, length(unique(.db_specs$table)))))
    filename <- file.path(source, paste0(tab$table[[1]], ".txt"))
    if (!file.exists(filename)) filename <- file.path(source, "validation", paste0(tab$table[[1]], ".txt"))
    if (!file.exists(filename)) {
      missing_tables <<- c(missing_tables, tab$table[[1]])
      message(stringr::str_pad(sprintf("\r File for table '%s' does not exist. This may occur for older ECOTOX releases\n",
                                    tab$table[[1]]),
                               width = 80, "right"))
      message(stringr::str_pad("\r Will try to continue without this table\n", width = 80, "right"))
      return(NULL)
    }
    
    ## Remove table from database if it already exists
    RSQLite::dbExecute(dbcon, sprintf("DROP TABLE IF EXISTS [%s];", tab$table[[1]]))

    ## specify query to create the table in the sqlite database
    foreign_keys <- tab[tab$foreign_key != "",, drop = F]
    if (nrow(foreign_keys) > 0) {
      foreign_keys <- apply(foreign_keys, 1, function(x) {
        sprintf("\tFOREIGN KEY(%s) REFERENCES [%s]", x[["field_name"]], x[["foreign_key"]])
      })
      foreign_keys <- paste(foreign_keys, collapse = ",\n")
    } else foreign_keys <- ""
    query <- tab[,names(tab) %in% c("field_name", "data_type", "primary_key", "not_null")]
    query[is.na(query)] <- ""
    query <- apply(query, 1, paste, collapse = " ")
    query <- paste(paste0("\t", trimws(query)), collapse = ",\n")
    if (foreign_keys != "") query <- paste0(query, ",\n", foreign_keys)
    query <- sprintf("CREATE TABLE [%s](\n%s\n);", tab$table[[1]], query)
    RSQLite::dbExecute(dbcon, query)

    head  <- NULL
    lines.read <- 1
    ## Copy tables in 50000 line fragments to database, to avoid memory issues
    frag.size  <- 50000
    message(crayon::white(sprintf("\r  0 lines (incl. header) of '%s' added to database", tab$table[[1]])),
            appendLF = FALSE)
    repeat {
      if (is.null(head)) {
        head <- iconv(readr::read_lines(filename, skip = 0, n_max = 1, progress = F), to = "UTF8", sub = "*")
      } else {
        testsize   <- ifelse(lines.read == 1, frag.size - 1, frag.size)
        ## readr sometimes generates warnings for possible parsing problems (inherited from vroom)
        ## however, running 'readr::problems' does not show any issues, muffle this warning
        ## if this is the case:
        body       <- withCallingHandlers({
          chunk <- readr::read_lines(filename, skip = lines.read, n_max = testsize, progress = F)
        }, warning = function(w) if (nrow(readr::problems(chunk)) == 0) rlang::cnd_muffle(w))
        body       <- suppressWarnings({iconv(body, to = "UTF8", sub = "*")})
        ## Some records incorrectly contain line feed characters. Replace with space:
        body       <- suppressWarnings({gsub("\U000D", " ", body)})
        ## Replace pipe-characters with dashes when they are between brackets "("and ")",
        ## These should not be interpreted as table separators and will mess up the table.read call
        body       <- stringr::str_replace_all(body, "(?<=\\().+?(?=\\))", function(x){
          ## there should not be another opening bracket, double pipe or forward slash! in that case leave as is
          if (grepl("[\\(/]", x) || grepl("||", x, fixed = T)) return(x)
          gsub("[|]", "-", x)
        })

        lines.read <- lines.read + length(body)

        ## Join lines when number of pipes is to small (probably caused by unintended linefeed)
        repeat{
          count_pipes <- unlist(lapply(regmatches(body, gregexpr("[|]", body)), length))
          join_lines  <- which(count_pipes < length(regmatches(head, gregexpr("[|]", head))[[1]]))[1:2]
          if (length(join_lines) > 0 & !any(is.na(join_lines))) {
            body        <- c(body[-join_lines], paste(body[join_lines], collapse = " "))
          } else break
        }

        ## strip.white is set to F, as they occur in primary keys!
        table.frag <- utils::read.table(text = c(head, body),
                                        sep = "|", header = TRUE, quote = "", comment.char = "",
                                        stringsAsFactors = FALSE, strip.white = FALSE)

        missing_cols    <- tab$field_name[!tab$field_name %in% colnames(table.frag)]
        unexpected_cols <- colnames(table.frag)[!colnames(table.frag) %in% tab$field_name]
        if (length(unexpected_cols) > 0)
          unexpected_fields <<- union(unexpected_fields, paste(tab$table[[1]], unexpected_cols, sep = "."))
        if (length(missing_cols) > 0)
          missing_fields    <<- union(missing_fields, paste(tab$table[[1]], missing_cols, sep = "."))
        
        if (RSQLite::dbExistsTable(dbcon, tab$table[[1]]) && ("PRIMARY KEY" %in% tab$primary_key)) {
          prim_key <- which(tab$primary_key == "PRIMARY KEY")
          RSQLite::dbWriteTable(dbcon, "temp",
                                table.frag[,setdiff(tab$field_name, missing_cols), drop = FALSE], overwrite = TRUE)
          
          ## When the primary key is not unique, update the table using the last occurrence of the primary key.
          updt <- dbplyr::sql_query_upsert(dbcon, tab$table[[1]], dplyr::tbl(dbcon, "temp"), by = names(table.frag)[prim_key],
                                           update_cols = names(table.frag)[-prim_key])
          written_len <- RSQLite::dbExecute(dbcon, updt)
          if (written_len < nrow(table.frag)) {
            message(
              stringr::str_pad(
                sprintf("\r Table '%s' contains less records than read from source. Likely cause: duplicate records in source.\n",
                        tab$table[[1]]),
                width = 80, "right")
            )
            incomplete_check <- union(incomplete_check, tab$table[[1]])
          }
          invisible(RSQLite::dbExecute(dbcon, "DROP TABLE IF EXISTS temp;"))
        } else {
          RSQLite::dbWriteTable(dbcon, tab$table[[1]],
                                table.frag[,setdiff(tab$field_name, missing_cols), drop = FALSE], append = TRUE)
        }

        message(crayon::white(sprintf("\r %i lines (incl. header) of '%s' added to database", lines.read, tab$table[[1]])),
                appendLF = F)
        if (length(body) < testsize) break
      }
    }
    message(crayon::green(" Done\n"))
    if (any(startsWith(unexpected_fields, paste0(tab$table[[1]], ".")))) {
      message(
        stringr::str_pad(
          sprintf("\r Ignored unexpected column(s) '%s'\n",
                  paste(unexpected_fields[startsWith(unexpected_fields, paste0(tab$table[[1]], "."))], collapse = "', '")),
          width = 80, "right")
      )
    }
    if (any(startsWith(missing_fields, paste0(tab$table[[1]], ".")))) {
      message(
        stringr::str_pad(
          sprintf("\r Missing column(s) '%s'\n",
                  paste(missing_fields[startsWith(missing_fields, paste0(tab$table[[1]], "."))], collapse = "', '")),
          width = 80, "right")
      )
    }
  })
  RSQLite::dbDisconnect(dbcon)
  if (write_log) {
    logfile      <- file.path(destination, paste0(basename(source), ".log"))
    downloadinfo <- file.path(destination, paste0(basename(source), "_cit.txt"))
    if (file.exists(logfile)) invisible(file.remove(logfile))
    if (file.exists(downloadinfo)) invisible(file.remove(downloadinfo))
    writeLines(text = sprintf(
      paste(c(
        "ECOTOXr SQLite log\n",
        "Source:        %s", "Destination:   %s", "Download info: %s", "Build with:    %s",
        "Build on:      %s", "Build date:    %s", "Missing tbls:  %s", "Missing flds:  %s",
        "Unexp. fields: %s", "Incomplete?:   %s"), collapse = "\n"),
      source,
      destination,
      ifelse(file.exists(downloadinfo), downloadinfo, "Not available"),
      paste0("ECOTOXr V", utils::packageVersion("ECOTOXr")),
      paste(Sys.info()[c("sysname", "release")], collapse = " "),
      format(Sys.Date(), "%Y-%m-%d"),
      paste(missing_tables, collapse = ", "),
      paste(missing_fields, collapse = ", "),
      paste(unexpected_fields, collapse = ", "),
      paste(incomplete_check, collapse = ", ")
    ),
    con = logfile)
  }
  return(invisible(NULL))
}

Try the ECOTOXr package in your browser

Any scripts or data that you put into this service are public.

ECOTOXr documentation built on Oct. 10, 2023, 1:05 a.m.