R/extr_comptox.R

Defines functions extr_comptox_ extr_comptox

Documented in extr_comptox

#' Download and Extract Data from CompTox Chemistry Dashboard
#'
#' This function interacts with the CompTox Chemistry Dashboard to download and
#' extract a wide range of chemical data based on user-defined search criteria.
#' It allows for flexible input types and supports downloading various chemical
#' properties, identifiers, and predictive data. It was inspired by the
#' `ECOTOXr::websearch_comptox` function.
#'
#' @param ids A character vector containing the items to be searched within the
#'   CompTox Chemistry Dashboard. These can be chemical names, CAS Registry Numbers
#'  (CASRN), InChIKeys, or DSSTox substance identifiers (DTXSID).
#' @param download_items A character vector of items to be downloaded.
#'   This includes a comprehensive set of chemical properties, identifiers, predictive
#'   data, and other relevant information. By Default, it download all the info
#' \describe{
#'   \item{CASRN}{The Chemical Abstracts Service Registry Number, a unique numerical
#'      identifier for chemical substances.}
#'   \item{INCHIKEY}{The hashed version of the full International Chemical
#'      Identifier (InChI) string.}
#'   \item{IUPAC_NAME}{The International Union of Pure and Applied Chemistry
#'      (IUPAC) name of the chemical.}
#'   \item{SMILES}{The Simplified Molecular Input Line Entry System (SMILES)
#'      representation of the chemical structure.}
#'   \item{INCHI_STRING}{The full International Chemical Identifier (InChI) string.}
#'   \item{MS_READY_SMILES}{The SMILES representation of the chemical structure,
#'      prepared for mass spectrometry analysis.}
#'   \item{QSAR_READY_SMILES}{The SMILES representation of the chemical structure,
#'      prepared for quantitative structure-activity relationship (QSAR) modeling.}
#'   \item{MOLECULAR_FORMULA}{The chemical formula representing the number and type
#'      of atoms in a molecule.}
#'   \item{AVERAGE_MASS}{The average mass of the molecule, calculated based on the
#'      isotopic distribution of the elements.}
#'   \item{MONOISOTOPIC_MASS}{The mass of the molecule calculated using the most
#'      abundant isotope of each element.}
#'   \item{QC_LEVEL}{The quality control level of the data.}
#'   \item{SAFETY_DATA}{Safety information related to the chemical.}
#'   \item{EXPOCAST}{Exposure predictions from the EPA's ExpoCast program.}
#'   \item{DATA_SOURCES}{Sources of the data provided.}
#'   \item{TOXVAL_DATA}{Toxicological values related to the chemical.}
#'   \item{NUMBER_OF_PUBMED_ARTICLES}{The number of articles related to the chemical
#'      in PubMed.}
#'   \item{PUBCHEM_DATA_SOURCES}{Sources of data from PubChem.}
#'   \item{CPDAT_COUNT}{The number of entries in the Chemical and Product Categories
#'      Database (CPDat).}
#'   \item{IRIS_LINK}{Link to the EPA's Integrated Risk Information System (IRIS)
#'     entry for the chemical.}
#'   \item{PPRTV_LINK}{Link to the EPA's Provisional Peer-Reviewed Toxicity Values
#'     (PPRTV) entry for the chemical.}
#'   \item{WIKIPEDIA_ARTICLE}{Link to the Wikipedia article for the chemical.}
#'   \item{QC_NOTES}{Notes related to the quality control of the data.}
#'   \item{ABSTRACT_SHIFTER}{Information related to the abstract shifter.}
#'   \item{TOXPRINT_FINGERPRINT}{The ToxPrint chemoinformatics fingerprint of the chemical.}
#'   \item{ACTOR_REPORT}{The Aggregated Computational Toxicology Resource (ACTOR)
#'      report for the chemical.}
#'   \item{SYNONYM_IDENTIFIER}{Identifiers for synonyms of the chemical.}
#'   \item{RELATED_RELATIONSHIP}{Information on related chemicals.}
#'   \item{ASSOCIATED_TOXCAST_ASSAYS}{Assays associated with the chemical
#'      in the ToxCast database.}
#'   \item{TOXVAL_DETAILS}{Details of toxicological values.}
#'   \item{CHEMICAL_PROPERTIES_DETAILS}{Details of the chemical properties.}
#'   \item{BIOCONCENTRATION_FACTOR_TEST_PRED}{Predicted bioconcentration factor from
#'      tests.}
#'   \item{BOILING_POINT_DEGC_TEST_PRED}{Predicted boiling point in degrees Celsius
#'      from tests.}
#'   \item{48HR_DAPHNIA_LC50_MOL/L_TEST_PRED}{Predicted 48-hour LC50 for Daphnia
#'      in mol/L from tests.}
#'   \item{DENSITY_G/CM^3_TEST_PRED}{Predicted density in g/cm³ from tests.}
#'   \item{DEVTOX_TEST_PRED}{Predicted developmental toxicity from tests.}
#'   \item{96HR_FATHEAD_MINNOW_MOL/L_TEST_PRED}{Predicted 96-hour LC50 for fathead
#'       minnow in mol/L from tests.}
#'   \item{FLASH_POINT_DEGC_TEST_PRED}{Predicted flash point in degrees Celsius
#'      from tests.}
#'   \item{MELTING_POINT_DEGC_TEST_PRED}{Predicted melting point in degrees Celsius
#'      from tests.}
#'   \item{AMES_MUTAGENICITY_TEST_PRED}{Predicted Ames mutagenicity from tests.}
#'   \item{ORAL_RAT_LD50_MOL/KG_TEST_PRED}{Predicted oral LD50 for rats in mol/kg
#'      from tests.}
#'   \item{SURFACE_TENSION_DYN/CM_TEST_PRED}{Predicted surface tension in dyn/cm
#'      from tests.}
#'   \item{THERMAL_CONDUCTIVITY_MW_M×K_TEST_PRED}{Predicted thermal conductivity
#'      in mW/m×K from tests.}
#'   \item{TETRAHYMENA_PYRIFORMIS_IGC50_MOL/L_TEST_PRED}{Predicted IGC50 for Tetrahymena
#'      pyriformis in mol/L from tests.}
#'   \item{VISCOSITY_CP_CP_TEST_PRED}{Predicted viscosity in cP from tests.}
#'   \item{VAPOR_PRESSURE_MMHG_TEST_PRED}{Predicted vapor pressure in mmHg from
#'      tests.}
#'   \item{WATER_SOLUBILITY_MOL/L_TEST_PRED}{Predicted water solubility in mol/L
#'      from tests.}
#'   \item{ATMOSPHERIC_HYDROXYLATION_RATE_\(AOH\)_CM3/MOLECULE\*SEC_OPERA_PRED}{Predicted
#'      atmospheric hydroxylation rate in cm³/molecule\*sec from OPERA.}
#'   \item{BIOCONCENTRATION_FACTOR_OPERA_PRED}{Predicted bioconcentration factor from OPERA.}
#'   \item{BIODEGRADATION_HALF_LIFE_DAYS_DAYS_OPERA_PRED}{Predicted biodegradation
#'      half-life in days from OPERA.}
#'   \item{BOILING_POINT_DEGC_OPERA_PRED}{Predicted boiling point in degrees
#'      Celsius from OPERA.}
#'   \item{HENRYS_LAW_ATM-M3/MOLE_OPERA_PRED}{Predicted Henry's law constant
#'      in atm-m³/mole from OPERA.}
#'   \item{OPERA_KM_DAYS_OPERA_PRED}{Predicted Km in days from OPERA.}
#'   \item{OCTANOL_AIR_PARTITION_COEFF_LOGKOA_OPERA_PRED}{Predicted octanol-air
#'      partition coefficient (log Koa) from OPERA.}
#'   \item{SOIL_ADSORPTION_COEFFICIENT_KOC_L/KG_OPERA_PRED}{Predicted soil adsorption
#'      coefficient (Koc) in L/kg from OPERA.}
#'   \item{OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED}{Predicted octanol-water partition
#'      coefficient (log P) from OPERA.}
#'   \item{MELTING_POINT_DEGC_OPERA_PRED}{Predicted melting point in degrees Celsius
#'      from OPERA.}
#'   \item{OPERA_PKAA_OPERA_PRED}{Predicted pKa (acidic) from OPERA.}
#'   \item{OPERA_PKAB_OPERA_PRED}{Predicted pKa (basic) from OPERA.}
#'   \item{VAPOR_PRESSURE_MMHG_OPERA_PRED}{Predicted vapor pressure in mmHg from
#'      OPERA.}
#'   \item{WATER_SOLUBILITY_MOL/L_OPERA_PRED}{Predicted water solubility in mol/L
#'      from OPERA.}
#'   \item{EXPOCAST_MEDIAN_EXPOSURE_PREDICTION_MG/KG-BW/DAY}{Predicted median
#'     exposure from ExpoCast in mg/kg-bw/day.}
#'   \item{NHANES}{National Health and Nutrition Examination Survey data.}
#'   \item{TOXCAST_NUMBER_OF_ASSAYS/TOTAL}{Number of assays in ToxCast.}
#'   \item{TOXCAST_PERCENT_ACTIVE}{Percentage of active assays in ToxCast.}
#' }
#' @param mass_error Numeric value indicating the mass error tolerance for searches
#'   involving mass data. Default is `0`.
#' @param verify_ssl Logical value indicating whether SSL certificates should be
#'   verified. Default is `FALSE`. Note that this argument is not used on linux OS.
#' @param verbose A logical value indicating whether to print detailed messages.
#'   Default is TRUE.
#' @param ... Additional arguments passed to `httr2::req_options()`.
#' @details
#' Please note that this function, which pulls data from EPA servers, may encounter
#'   issues on some Linux systems. This is because those servers do not accept secure
#'   legacy renegotiation. On Linux systems, the current function depends on \code{curl}
#'   and \code{OpenSSL}, which have known problems with unsafe legacy renegotiation
#'   in newer versions. One workaround is to downgrade to \code{curl v7.78.0} and
#'   \code{OpenSSL v1.1.1}. However, please be aware that using these older versions
#'   might introduce potential security vulnerabilities. Refer to
#'   \href{https://gist.github.com/c1au6i0/5cc2d87966340a31032ffebf1cfb657c}{this gist}
#'   for instructions on how to downgrade \code{curl} and \code{OpenSSL} on Ubuntu.
#' @seealso \href{https://www.epa.gov/comptox-tools/comptox-chemicals-dashboard-resource-hub}{CompTox Chemicals Dashboard Resource Hub}
#' @return A cleaned data frame containing the requested data from CompTox.
#' @export
#' @examples
#' \donttest{
#' # Example usage of the function:
#' extr_comptox(ids = c("Aspirin", "50-00-0"))
#' }
extr_comptox <- function(ids,
                         download_items = c(
                           "CASRN",
                           "INCHIKEY",
                           "IUPAC_NAME",
                           "SMILES",
                           "INCHI_STRING",
                           "MS_READY_SMILES",
                           "QSAR_READY_SMILES",
                           "MOLECULAR_FORMULA",
                           "AVERAGE_MASS",
                           "MONOISOTOPIC_MASS",
                           "QC_LEVEL",
                           "SAFETY_DATA",
                           "EXPOCAST",
                           "DATA_SOURCES",
                           "TOXVAL_DATA",
                           "NUMBER_OF_PUBMED_ARTICLES",
                           "PUBCHEM_DATA_SOURCES",
                           "CPDAT_COUNT",
                           "IRIS_LINK",
                           "PPRTV_LINK",
                           "WIKIPEDIA_ARTICLE",
                           "QC_NOTES",
                           "ABSTRACT_SHIFTER",
                           "TOXPRINT_FINGERPRINT",
                           "ACTOR_REPORT",
                           "SYNONYM_IDENTIFIER",
                           "RELATED_RELATIONSHIP",
                           "ASSOCIATED_TOXCAST_ASSAYS",
                           "TOXVAL_DETAILS",
                           "CHEMICAL_PROPERTIES_DETAILS",
                           "BIOCONCENTRATION_FACTOR_TEST_PRED",
                           "BOILING_POINT_DEGC_TEST_PRED",
                           "48HR_DAPHNIA_LC50_MOL/L_TEST_PRED",
                           "DENSITY_G/CM^3_TEST_PRED",
                           "DEVTOX_TEST_PRED",
                           "96HR_FATHEAD_MINNOW_MOL/L_TEST_PRED",
                           "FLASH_POINT_DEGC_TEST_PRED",
                           "MELTING_POINT_DEGC_TEST_PRED",
                           "AMES_MUTAGENICITY_TEST_PRED",
                           "ORAL_RAT_LD50_MOL/KG_TEST_PRED",
                           "SURFACE_TENSION_DYN/CM_TEST_PRED",
                           "THERMAL_CONDUCTIVITY_MW/(M*K)_TEST_PRED",
                           "TETRAHYMENA_PYRIFORMIS_IGC50_MOL/L_TEST_PRED",
                           "VISCOSITY_CP_CP_TEST_PRED",
                           "VAPOR_PRESSURE_MMHG_TEST_PRED",
                           "WATER_SOLUBILITY_MOL/L_TEST_PRED",
                           "ATMOSPHERIC_HYDROXYLATION_RATE_(AOH)_CM3/MOLECULE*SEC_OPERA_PRED",
                           "BIOCONCENTRATION_FACTOR_OPERA_PRED",
                           "BIODEGRADATION_HALF_LIFE_DAYS_DAYS_OPERA_PRED",
                           "BOILING_POINT_DEGC_OPERA_PRED",
                           "HENRYS_LAW_ATM-M3/MOLE_OPERA_PRED",
                           "OPERA_KM_DAYS_OPERA_PRED",
                           "OCTANOL_AIR_PARTITION_COEFF_LOGKOA_OPERA_PRED",
                           "SOIL_ADSORPTION_COEFFICIENT_KOC_L/KG_OPERA_PRED",
                           "OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED",
                           "MELTING_POINT_DEGC_OPERA_PRED",
                           "OPERA_PKAA_OPERA_PRED",
                           "OPERA_PKAB_OPERA_PRED",
                           "VAPOR_PRESSURE_MMHG_OPERA_PRED",
                           "WATER_SOLUBILITY_MOL/L_OPERA_PRED",
                           "EXPOCAST_MEDIAN_EXPOSURE_PREDICTION_MG/KG-BW/DAY",
                           "NHANES",
                           "TOXCAST_NUMBER_OF_ASSAYS/TOTAL",
                           "TOXCAST_PERCENT_ACTIVE"
                         ),
                         mass_error = 0,
                         verify_ssl = FALSE,
                         verbose = TRUE,
                         ...) {
  if (base::missing(ids)) {
    cli::cli_abort("The argument {.field ids} is required.")
  }

  check_internet(verbose = verbose)
  xlsx_file <- tempfile(fileext = ".xlsx")


  base_url <- "https://comptox.epa.gov/dashboard-api/batchsearch/export/?lb2ljny4"

  resp <- extr_comptox_(
    ids = ids,
    download_items = download_items,
    mass_error = mass_error,
    verify_ssl = verify_ssl,
    xlsx_file = xlsx_file,
    base_url = base_url,
    verbose = verbose,
    ...
  )

  sheet_names <- readxl::excel_sheets(xlsx_file)

  suppressMessages(
    dat_list <- lapply(sheet_names, readxl::read_excel, path = xlsx_file)
  )

  names(dat_list) <- paste0("comptox_", gsub(" ", "_", tolower(sheet_names)))
  unlink(xlsx_file)

  out <- lapply(dat_list, janitor::clean_names)

  # This is to get the ids that were not retrieved
  dat <- out$comptox_main_data

  ids_not_found <- dat$input[grepl("Found 0|fails checksum", dat$found_by)]

  if (all(isTRUE(verbose), length(ids_not_found) != 0)) {
    cli::cli_warn("Chemical{?s} {.field {ids_not_found}} not found!")
  }

  out
}

#' @inherit extr_comptox title description
#' @keywords internal
#' @noRd
#' @inheritParams extr_comptox
#' @param  xlsx_file Path to file to write with results.
#' @param  base_url Comptox url.
extr_comptox_ <- function(ids,
                          download_items = NULL,
                          mass_error = 0,
                          verify_ssl = FALSE,
                          xlsx_file,
                          base_url,
                          verbose = TRUE,
                          ...) {
  libcurl_opt <- set_ssl(verify_ssl = verify_ssl, other_opt = ...)

  identifier_types <- c("chemical_name", "CASRN", "INCHIKEY", "dtxsid")
  input_type <- "IDENTIFIER"

  download_items <- c("DTXCID", download_items)

  params <- list(
    identifierTypes = identifier_types,
    massError = mass_error,
    downloadItems = download_items,
    searchItems = paste0(ids, collapse = "\n"),
    inputType = input_type,
    downloadType = "EXCEL"
  )

  if (isTRUE(verbose)) {
    cli::cli_alert_info("Sending request to CompTox...")
  }

  error_result <- NULL

  post_result <- tryCatch(
    {
      httr2::request(base_url) |>
        httr2::req_retry(max_tries = 2, backoff = ~3) |>
        httr2::req_body_json(params) |>
        httr2::req_options(!!!libcurl_opt) |>
        httr2::req_method("POST") |>
        httr2::req_perform()
    },
    error = function(e) {
      error_result <<- e
      NULL
    }
  )

  msg <- "Failed to perform the request: {conditionMessage(error_result)}"

  if (!is.null(error_result)) {
    if (grepl("unsafe legacy renegotiation disabled", conditionMessage(error_result))) {
      msg <- c(msg, "", cli::style_italic("!If you are using openssl, you might need to downgrade to curl v7.78.0, openssl v1.1.1!"))
    }
    cli::cli_abort(msg)
  }

  check_status_code(post_result, verbose = verbose)

  response_body <- httr2::resp_body_string(post_result)

  # base url for download
  base_url_down <- "https://comptox.epa.gov/dashboard-api/batchsearch/export/content/"

  if (isTRUE(verbose)) {
    cli::cli_alert_info("Getting info from CompTox...")
  }

  Sys.sleep(5)

  resp <- tryCatch(
    {
      httr2::request(paste0(base_url_down, response_body)) |>
        httr2::req_options(!!!libcurl_opt) |>
        httr2::req_perform()
    },
    error = function(e) {
      cli::cli_abort("Failed to perform the request: {conditionMessage(e)}")
    }
  )

  check_status_code(resp, verbose = verbose)

  resp |>
    httr2::resp_body_raw() |>
    writeBin(xlsx_file)
}

Try the extractox package in your browser

Any scripts or data that you put into this service are public.

extractox documentation built on April 4, 2025, 2:30 a.m.