R/create_toxEval.R

Defines functions as.toxEval allowed_names summary.toxEval rm_em_dash check_cols create_toxEval

Documented in as.toxEval create_toxEval summary.toxEval

#' Load and check toxEval data
#'
#' This function is used to load a data file for analysis in the form of a
#' single Excel file. The Excel file should include 3 mandatory sheets named
#' "Data", "Chemicals", and "Sites". Additionally there are 2 optional sheets:
#' "Exclude" and "Benchmarks". This function creates a data frame for each
#' sheet,  perform basic checks on the data to assure that required columns are
#' included for each sheet
#'
#' Required columns in the Data sheet include "CAS", "SiteID", "Value", and
#' "Sample Date". The "Value" column includes concentration measurements in
#' units of \eqn{\mu}g/L. "Sample Date" can be either a date or date/time or an integer.
#' Additional columns may be included for user purposes, but will not be used in
#' toxEval.
#'
#' Required columns in the Chemical sheet include "CAS", "Class". "CAS" values in
#' this sheet must exactly match corresponding "CAS" values in the Data sheet. The
#' "Class" designation allows  data to be grouped in a user-specified way. For
#' example, in a data set of multiple pesticides, it may be valuable to explore
#' differences and similarities to of insecticides, herbicides and fungicides.
#' Additional columns may be included for user purposes, but will not be used in
#' toxEval.
#'
#' Required columns in the Sites sheet includes "SiteID", "Short Name", and for
#' the Shiny application "dec_lat","dec_lon". Values in the "SiteID" column in
#' this sheet exactly match corresponding values in the "SiteID" column in the
#' Data sheet. Additional columns may be included for user purposes, but will not
#' be used in toxEval.
#'
#' When using the optional sheet Exclude, columns required include "CAS" and
#' "endPoint". These are used to exclude particular chemicals (via CAS),
#' ToxCast endpoints (via endPoint), or a unique chemical/endpoint combination.
#' Additional columns may be included for user purposes, but will not be used
#' in toxEval.
#'
#' When using the optional sheet Benchmarks, columns required include "CAS",
#' "endPoint","ACC_value" and "chnm". This sheet is used to over-ride the
#' functions using endpoints from the ToxCast database, allowing the user
#' to import endpoint information from other sources. It could also be useful
#' for reproducing results in the future (for example, if after ToxCast updates,
#' analysis with an older version of ToxCast may be reproduced by including the
#' selected ToxCast endpoint database in this sheet. Additional columns may be included for user
#' purposes, but will not be used in toxEval.
#'
#' For more information, see the "Prepare Data" vignette: \href{../doc/PrepareData.html}{\code{vignette("PrepareData", package = "toxEval")}}.
#'
#' All remaining toxEval functions use data from via the list that is returned
#' from this function.
#'
#' @return The object returned from this function contains a list of between
#' three and five data frames. The minimum data frames returned are chem_data
#' (containing at least the columns: "CAS", "SiteID", "Value", "Sample Date"),
#' chem_info (containing at least the columns: "CAS", "Class"), and
#' chem_site (containing at least the columns: "SiteID", "Short Name". For the
#' Shiny app, "dec_lat" and "dec_lon" are also required). The optional data
#' frames are exclusions (containing at least the columns: "CAS" and "endPoint"),
#' and benchmarks (containing at least the columns: "CAS", "endPoint",
#' "ACC_value" and "chnm")
#'
#' @param excel_file_path Path to Excel file that contains at least 3 sheets: Data, Chemicals, and Sites,
#' and could optionally contain Exclude and Benchmarks.
#' @param \dots data frames to override data within the original x list.
#'
#' @export
#' @examples
#'
#' path_to_tox <- system.file("extdata", package = "toxEval")
#' file_name <- "OWC_data_fromSup.xlsx"
#' excel_file_path <- file.path(path_to_tox, file_name)
#' tox_list <- create_toxEval(excel_file_path)
create_toxEval <- function(excel_file_path, ...) {
  if (!file.exists(excel_file_path)) {
    stop("File does not exist, check path and spelling")
  }

  if ("Chemicals" %in% readxl::excel_sheets(excel_file_path)) {
    chem_info <- readxl::read_excel(excel_file_path,
      sheet = "Chemicals",
      guess_max = 10000
    )

    names(chem_info) <- names(chem_info) %>%
      allowed_names(c("casrn", "casn", "CASRN", "CASN"), "CAS")

    if ("CAS" %in% names(chem_info)) {
      chem_info$CAS <- as.character(chem_info$CAS)
    } else {
      message("Chemical tab missing CAS column")
    }
  }

  if ("Data" %in% readxl::excel_sheets(excel_file_path)) {
    chem_data <- readxl::read_excel(excel_file_path,
      sheet = "Data",
      guess_max = 100000
    )

    names(chem_data) <- names(chem_data) %>%
      allowed_names(c("site", "Site", "Station", "station"), "SiteID") %>%
      allowed_names(c("Date", "date", "sample", "Sample"), "Sample Date") %>%
      allowed_names(c("casrn", "casn", "CASRN", "CASN"), "CAS")

    if ("CAS" %in% names(chem_data)) {
      chem_data$CAS <- as.character(chem_data$CAS)
    } else {
      message("Data tab missing CAS column")
    }

    if ("SiteID" %in% names(chem_data)) {
      chem_data$SiteID <- as.character(chem_data$SiteID)
    } else {
      message("Data tab missing SiteID column")
    }
  }

  if ("Sites" %in% readxl::excel_sheets(excel_file_path)) {
    chem_site <- readxl::read_excel(excel_file_path,
      sheet = "Sites",
      guess_max = 5000
    )

    names(chem_site) <- names(chem_site) %>%
      allowed_names(c("site", "Site", "Station", "station"), "SiteID") %>%
      allowed_names(c("shortName", "map_name"), "Short Name") %>%
      allowed_names(c("dec_long", "longitude", "long"), "dec_lon") %>%
      allowed_names(c("lat", "latitude"), "dec_lat")

    if ("SiteID" %in% names(chem_site)) {
      chem_site$SiteID <- as.character(chem_site$SiteID)
    } else {
      message("Sites tab missing SiteID column")
    }
  }

  if ("Exclude" %in% readxl::excel_sheets(excel_file_path)) {
    exclusions <- readxl::read_excel(excel_file_path,
      sheet = "Exclude",
      guess_max = 1000
    )
  } else {
    exclusions <- NULL
  }

  if ("Benchmarks" %in% readxl::excel_sheets(excel_file_path)) {
    benchmarks <- readxl::read_excel(excel_file_path,
      sheet = "Benchmarks",
      guess_max = 10000
    )
    if (nrow(benchmarks) == 0) {
      benchmarks <- NULL
    }
  } else {
    benchmarks <- NULL
  }

  rawTox <- list(
    chem_data = chem_data,
    chem_info = chem_info,
    chem_site = chem_site,
    exclusions = exclusions,
    benchmarks = benchmarks
  )

  rawTox <- as.toxEval(rawTox, ...)

  return(rawTox)
}

check_cols <- function(req_cols, tab, tab_name, mandatory = TRUE) {

  # allow column names to be case and whitespace insensitive:
  req_cols_low <- tolower(req_cols)
  req_cols_low <- gsub(" ", "", req_cols_low)
  req_cols_low <- gsub("_", "", req_cols_low)
  req_cols_low <- gsub("\\.", "", req_cols_low)

  orig_names <- names(tab)
  names(tab) <- tolower(names(tab))
  names(tab) <- gsub(" ", "", names(tab))
  names(tab) <- gsub("_", "", names(tab))
  names(tab) <- gsub("\\.", "", names(tab))

  if (!all(req_cols_low %in% names(tab))) {
    miss_col <- req_cols_low[!req_cols_low %in% names(tab)]

    if (mandatory) {
      stop("Missing ", miss_col, " column in ", tab_name)
    } else {
      warning(
        "Missing optional column: ", miss_col, " in", tab_name,
        ".\nThis could cause Shiny app to not work properly"
      )
    }
  }
  # Transform the column names the "required" names:
  names(tab)[!(names(tab) %in% req_cols_low)] <- orig_names[-which(names(tab) %in% req_cols_low)]
  names(tab)[(names(tab) %in% req_cols_low)] <- unlist(sapply(names(tab), function(x) req_cols[which(req_cols_low == x)]))

  tab <- rm_em_dash(tab)

  return(tab)
}


rm_em_dash <- function(df) {
  for (i in as.integer(which(sapply(df, class) == "character"))) {
    df[[i]] <- gsub("\u002D|\u2013|\u2014|\u2212", "-", df[[i]])
  }
  return(df)
}


#' Summary of tox_list
#'
#' A "tox_list" object is created from \code{create_toxEval}. It
#' is a list of  5 data frames: chem_data, chem_info,
#' chem_site, exclusions, and benchmarks. This function returns
#' a message with how many chemicals have ToxCast information,
#' and returns a vector of which chemicals do not have ToxCast information.
#'
#' @param object toxEval object with "chem_info" data frame included.
#' @param \dots additional parameters
#'
#' @export
#' @importFrom magrittr "%>%"
#' @examples
#' path_to_tox <- system.file("extdata", package = "toxEval")
#' file_name <- "OWC_data_fromSup.xlsx"
#' excel_file_path <- file.path(path_to_tox, file_name)
#' tox_list <- create_toxEval(excel_file_path)
#' summary(tox_list)
summary.toxEval <- function(object, ...) {
  CAS <- endPoint <- chnm <- flags <- ".dplyr"

  if (is.null(object[["benchmarks"]])) {
    ACC <- ToxCast_ACC %>%
      filter(CAS %in% unique(object$chem_info$CAS))
    bench_word <- "ToxCast"
  } else {
    ACC <- object[["benchmarks"]]
    bench_word <- "benchmark"
  }

  CAS_w_data <- ACC %>%
    select(CAS) %>%
    distinct() %>%
    pull(CAS)

  message(length(CAS_w_data), " chemicals have ", bench_word, " information")
  message("Chemicals returned from this function do NOT have ", bench_word, " information:")
  return(unique(object$chem_info$CAS)[!(unique(object$chem_info$CAS) %in% CAS_w_data)])
}


allowed_names <- function(col_names, allowable, better) {
  if (any(allowable %in% col_names)) {
    col_names[col_names %in% allowable] <- better
  }
  return(col_names)
}


#' toxEval helper functions
#'
#' A small collection of helper functions for toxEval
#'
#' @export
#' @param x list or toxEval object
#' @param \dots data frames to override data within the original x list.
#' @rdname helperToxEval
#'
#' @examples
#' path_to_tox <- system.file("extdata", package = "toxEval")
#' file_name <- "OWC_data_fromSup.xlsx"
#' full_path <- file.path(path_to_tox, file_name)
#' tox_list <- create_toxEval(full_path)
#'
#' # To over-ride one of the data frames:
#' chem_data <- data.frame(
#'   CAS = "21145-77-7",
#'   Value = 1,
#'   "Sample Date" = as.Date("2012-01-01"),
#'   SiteID = "USGS-04249000"
#' )
#' tox_list_new <- as.toxEval(tox_list, chem_data)
#'
as.toxEval <- function(x, ...) {
  matchReturn <- c(
    do.call("c", list(...)[sapply(list(...), class) == "list"]), # get the list parts
    list(...)[sapply(list(...), class) != "list"]
  )

  if (is.null(names(matchReturn))) {
    names(matchReturn) <- match.call(expand.dots = FALSE)$...
  }

  if ("chem_data" %in% names(matchReturn)) {
    message("Replacing chem_data from list with supplied data frame.")
    x[["chem_data"]] <- matchReturn[["chem_data"]]
  }

  if ("chem_info" %in% names(matchReturn)) {
    message("Replacing chem_info from list with supplied data frame.")
    x[["chem_info"]] <- matchReturn[["chem_info"]]
  }

  if ("chem_site" %in% names(matchReturn)) {
    message("Replacing chem_site from list with supplied data frame.")
    x[["chem_site"]] <- matchReturn[["chem_site"]]
  }

  if ("exclusions" %in% names(matchReturn)) {
    message("Replacing exclusions from list with supplied data frame.")
    x[["exclusions"]] <- matchReturn[["exclusions"]]
  }

  if ("benchmarks" %in% names(matchReturn)) {
    message("Replacing benchmarks from list with supplied data frame.")
    x[["benchmarks"]] <- matchReturn[["benchmarks"]]
  }

  names(x) <- names(x) %>%
    tolower() %>%
    allowed_names("data", "chem_data") %>%
    allowed_names("chemicals", "chem_info") %>%
    allowed_names("sites", "chem_site") %>%
    allowed_names("exclude", "exclusions")

  required_data <- c("chem_data", "chem_info", "chem_site")

  if (!all(required_data %in% names(x))) {
    stop("Missing", required_data[!required_data %in% names(x)])
  }

  chem_data <- x[["chem_data"]]
  chem_info <- x[["chem_info"]]
  chem_site <- x[["chem_site"]]

  if ("benchmarks" %in% names(x)) {
    benchmarks <- x[["benchmarks"]]
  } else {
    benchmarks <- NULL
  }

  if ("exclusions" %in% names(x)) {
    exclusions <- x[["exclusions"]]
  } else {
    exclusions <- NULL
  }

  # chem_data:
  names(chem_data) <- names(chem_data) %>%
    allowed_names(c("site", "Site", "Station", "station"), "SiteID") %>%
    allowed_names(c("Date", "date", "sample", "Sample"), "Sample Date") %>%
    allowed_names(c("casrn", "casn", "CASRN", "CASN"), "CAS")

  chem_data <- check_cols(c("CAS", "SiteID", "Value", "Sample Date"), chem_data, tab_name = "chem_data")

  if (!is.numeric(chem_data$Value)) {
    stop("The 'Value' column in the Data sheet is not numeric")
  }

  # chem_info:
  names(chem_info) <- names(chem_info) %>%
    allowed_names(c("casrn", "casn", "CASRN", "CASN"), "CAS")

  chem_info <- check_cols(c("CAS", "Class"), chem_info, tab_name = "chem_info")

  # chem_site:
  names(chem_site) <- names(chem_site) %>%
    allowed_names(c("site", "Site", "Station", "station"), "SiteID") %>%
    allowed_names(c("shortName", "map_name"), "Short Name") %>%
    allowed_names(c("dec_long", "longitude", "long"), "dec_lon") %>%
    allowed_names(c("lat", "latitude"), "dec_lat")

  chem_site <- check_cols(c("SiteID", "Short Name"), chem_site, tab_name = "chem_site")

  # Check columns needed for shiny app:
  chem_site <- check_cols(c("dec_lat", "dec_lon"),
    chem_site,
    mandatory = FALSE, tab_name = "chem_site"
  )

  if (!("site_grouping" %in% names(chem_site))) {
    chem_site$site_grouping <- ""
  }

  if (!is.null(exclusions)) {
    names(exclusions) <- names(exclusions) %>%
      allowed_names(c("casrn", "casn", "CASRN", "CASN"), "CAS")

    exclusions <- check_cols(c("CAS", "endPoint"), exclusions, tab_name = "exclusions")
  }

  if (!is.null(benchmarks)) {
    names(benchmarks) <- names(benchmarks) %>%
      allowed_names(c("casrn", "casn", "CASRN", "CASN"), "CAS") %>%
      allowed_names(c("Value", "value", "ACC", "ACC_value"), "ACC_value") %>%
      allowed_names(c("chemical", "chnm", "Chemical", "Compound"), "chnm")

    benchmarks <- check_cols(c("CAS", "endPoint", "ACC_value", "chnm"),
      benchmarks,
      tab_name = "benchmarks"
    )
    if (!("groupCol" %in% names(benchmarks))) {
      benchmarks$groupCol <- "Benchmarks"
    }
  }

  # Check that all CAS in Data in Chemical
  data_cas <- unique(chem_data$CAS)
  chem_cas <- unique(chem_info$CAS)
  if (!all(data_cas %in% chem_cas)) {
    warning("Not all CAS in the Data sheet are listed in the Chemical sheet")
  }

  # Check that all SiteID in Data in Sites
  data_siteID <- unique(chem_data$SiteID)
  site_siteID <- unique(chem_site$SiteID)
  if (!all(data_siteID %in% site_siteID)) {
    warning("Not all SiteID in the Data sheet are listed in the Sites sheet")
  }
  rawData <- list(
    chem_data = chem_data,
    chem_info = chem_info,
    chem_site = chem_site,
    exclusions = exclusions,
    benchmarks = benchmarks
  )

  class(rawData) <- "toxEval"

  return(rawData)
}

Try the toxEval package in your browser

Any scripts or data that you put into this service are public.

toxEval documentation built on April 15, 2023, 1:13 a.m.