R/process_unit.r
In ECOTOXr: Download and Extract Data from US EPA's ECOTOX Database

Documented in as_unit_ecotox mixed_to_single_unit process_ecotox_units

.ut_sep0 <- "[ :/*^]"                      ## separator characters of unit fragment;
                                           ## essentially maths operators and spaces
.ut_sep1 <- sprintf("(?<=^|%s)", .ut_sep0) ## separator indicating start of unit fragment
.ut_sep2 <- sprintf("(?=$|%s)",  .ut_sep0) ## separator indicating end of unit fragment

## Commonly occurring annotations in units:

.ut_suffix <- 
  c("act", "AE", "ae", "arb", "ash", "ATP", "B-nap", "B-naph", "BAPNA", "bdwt", "blood",
    "BO", "bt", "BTEE", "BW", "C", "C2H4", "Ca", "CaO", "CaCO3", "caliper", "canopy", "cap",
    "CCT", "CEC", "change", "clay", "clitellate", "CNTL", "CO2", "co2", "CO3", "com", "corn",
    "cortex", "CREA", "d soil", "dev", "diet", "DNA", "DOPA", "dose", "dry", "DOPA", "DT",
    "earliness", "EDTA", "egg", "eu", "EU", "evolved", "FA", "FATL", "fertile", "fl", "fluid",
    "FM", "folium", "GAIN", "GH", "H2O", "H2O2", "HA", "Hb", "HCl", "HCO3", "Hg", "ht", "humus",
    "in", "ingested", "INHIB", "initial", "intake", "K[+]", "lipid", "LIT", "lit", "litter",
    "lf", "mat", "max", "MDA", "mdhyde", "media", "Mg", "min", "MIT", "MO", "N", "NaCl", "NH3",
    "NO2", "node", "O2", "OC", "of", "OH", "oil", "OM", "org", "P", "P[+]", "p450", "PBG", "PC",
    "phenolphth", "PLIPD", "plt", "pod", "pro", "prod", "PRTL", "RBC", "RBCE", "ret", "RI",
    "RNA", "ro", "root", "S", "sat", "sd", "sed", "seed", "soil", "soln", "solvent", "srtl",
    "SP", "sperm", "TE", "TEQ", "TI", "TIME", "tolerance", "total", "UA", "urea", "WBC", "WDTH",
    "wet", "wght", "WGHT", "WSF", "wt", "yld", "Zn")

## Equivalents for 'durations':

.ut_second   <-
  c("s", "sec", "spf", "spref")

.ut_minute   <-
  c("mi", "mpf", "mph")

.ut_hour   <-
  c("h", "hbe", "hbf", "hbh", "hpe", "hpel", "hpf", "hph", "hpp", "hpr", "hps")

.ut_day   <-
  c("BDAY", "dapu", "dayef", "dayg", "dayl", "daym", "dayv", "dbh", "dge", "dla", "dpe",
    "dpel", "dpes", "dpf", "dpfg", "dpfl", "dpgm", "dph", "dph", "dphv", "dpm", "dpmm",
    "dpn", "dpo", "dpp", "dpr", "dpref", "dps", "dpu", "dpw", "dpys", "dws", "hd")

.ut_week   <-
  c("wk", "wks", "wkpm", "wkprf", "wpe", "wph", "wphv", "wpp", "wps", "wpv3", "wpv5")

.ut_month   <-
  c("mo", "mope", "mopf", "moph", "mopm", "mopres", "mopswm", "mpgm", "mpp")

.ut_year   <-
  c("yr", "yc", "yph")

## Unit fragments representing counts
## Note that cc only stands for 'cubic centimetres' in case of 'cc/L'.
## In all other cases it is 'cocoons'.
.ut_counts <- c("ad", "(beat(s?))", "(bee(s?))", "(branch((es)?))", "bt", "bud",
                "(burrow(s?))", "cc", "(cel(l?)(s?))", "(cast((ing)?)(s?))", "chem",
                "(cluster(s?))", "(clutch((es)?))", "cntr", "(cyc((le)?)(s?))", "dead",
                "(egg(s?))", "(em((bryo)?)(s?))", "(failure(s?))", "(fet((us)?)((es)?))",
                "(fibre(s?))", "(field(s?))", "FM", "gland", "jv", "live", "ML",
                "(neuron(s?))", "no", "nuclei", "org", "(pair(s?))", "panicle",
                "(pl((ot)?(s?)))", "(pot(s?))", "preg", "raft", "(seed(s?))", "sad",
                "section", "(trail(s?))", "tubule")

## Straight forward replacements to make units more explicit
.ut_explicit <-
  data.frame(
    pattern     = c("lbs", "Nm", "ac",   "0/00", "wk(s?)", "u(g?)-atoms",
                    "pph(r?)", "[*]dyn", "gTI", "ga",  "cc/L",  "mi", "mo", "deg",
                    "sqft", "PH",           "pm",          "fs",             "mgL"),
    replacement = c("lb",  "nm", "acre", "ppt",  "week",   "ug",
                    "10ppt",   "dyne",   "g",   "gal", "cm3/L", "min", "month", "degree",
                    "ft2",  "pithardening", "postmolting", "floweringstage", "mg*L")
  )

## Function to replace unit fragments in text

.replace_ut_frag <- function(pattern, replacement, x) {
  gsub(paste0(.ut_sep1, pattern, .ut_sep2), replacement, x, perl = TRUE)
}

## Function to convert database field name to unit type

.field_to_unit_type <- function(field) {
  type <- .db_specs$foreign_key[endsWith(field, .db_specs$field_name)]
  type <- gsub("[_].*", "", type) |> unique()
  if (length(type) == 0) return("unknown")
  if (type == "sample") type <- "size"
  type
}

#' Process ECOTOX search results by converting `character` to units where relevant
#' 
#' `r lifecycle::badge('experimental')` The function `search_ecotox()` returns fields
#' from the ECOTOX database as is. Fields that represent units are not standardised in
#' the database. Therefore, this format is not consistently used throughout the
#' database. `process_ecotox_units()` takes a `data.frame` returned by
#' `search_ecotox()`, locates unit columns, represented by text, sanitises the text
#' and converts them to `units::mixed_units()` objects. It will sanitise the unit fields as
#' much as possible. Units that could not be interpreted are returned as arbitrary `unit`.
#' @param x A `data.frame` obtained with `search_ecotox()`, for which the units need
#' to be processed.
#' @param .fns Function to convert `character` to unit. By default `as_unit_ecotox()`
#' is used which also sanitises the input. You can also write a custom function.
#' @param ... Arguments passed to `.fns`.
#' @param .names A 'glue' specification used to rename the unit columns. By default
#' it is `"{.col}"`, which will overwrite existing text columns with unit columns.
#' You can for instance add a suffix with `"{.col}_unit"` if you want to
#' rename the resulting unit columns.
#' @returns Returns a `data.frame` in which the columns containing unit information
#' is converted from the character format from the database to actual unit objects (
#' `?units::units`).
#' @author Pepijn de Vries
#' @examples
#' if (check_ecotox_availability()) {
#'   df <- search_ecotox(
#'     list(
#'       latin_name    = list(
#'         terms          = c("Skeletonema", "Daphnia"),
#'         method         = "contains"
#'       ),
#'       chemical_name = list(
#'         terms          = "benzene",
#'         method         = "exact"
#'       )
#'     ), list_ecotox_fields("full"))
#'
#'   df_unit <-
#'     process_ecotox_units(df, warn = FALSE)
#' }
#' @family ecotox-sanitisers
#' @export
process_ecotox_units <- function(x, .fns = as_unit_ecotox, ..., .names = NULL) {
  ## identify unit columns
  unit_columns <- c(.db_specs$field_name[grepl("_unit$", .db_specs$field_name)], "_unit")
  patt <- sprintf("^.*?(%s)$",
                  paste(unit_columns, collapse = "|"))
  
  x |>
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::matches(patt, perl = TRUE),
        .fns  = ~ {
          if ("type" %in% names(formals(.fns))) {
            .fns(.x, type = .field_to_unit_type(dplyr::cur_column()), ...)
          } else {
            .fns(.x, ...)
          }
        },
        .names = .names
      )
    )
}

#' Text from the ECOTOX database to `mixed_units`
#' 
#' `r lifecycle::badge('experimental')` Convert text to units after
#' sanitising.
#' 
#' The following steps are performed (in the order as listed)
#' to sanitise text before coercing it to units:
#' 
#'  * The following is removed:
#'    * Leading/trailing white spaces
#'    * Square brackets and commas
#'    * A list of common prefixes
#'    * Double spaces are replaced by single spaces
#'    * Brackets around multiply symbol
#'  * The following is corrected/adjusted:
#'    * 'for' is interpreted as multiplication
#'    * Scientific notation of numbers is
#'      standardised where possible.
#'    * A list of ambiguous patterns is replaced with
#'      more explicit strings. For instance,
#'      'deg' is replaced with 'degree'.
#'  * The following miscellaneous corrections are made:
#'    * A list of 'known' annotations are removed from the units
#'    * A list of elements kown to represent counts are renamed
#'      'counts'.
#'    * Percentages are renamed as explicit concentration in
#'      mass per volume or volume per volume units where possible
#'    * 'CI' is renamed 'Curies'.
#'    * 'M' is renamed 'mol/L'.
#'    * Units expressed as 'parts per ...' are explicitly renamed
#'      to mass over volume, or volum over volume where possible
#'  * Type specific sanitation steps
#'    * Concentration units:
#'      * 'K' is renamed 'Karmen'
#'      * 'dpm' is renamed 'counts/min' (i.e., disintegrations per minute)
#'    * Media units:
#'      * 'K' is renamed 'Kelvin'
#'      * 'C' is renamed 'Celsius'
#'  * Some final miscellaneous adjustments:
#'    * Scientific notation in numbers is not supported by the units package.
#'      Numbers are formatted in decimal notation where possible.
#'    * Spaces are removed if preceded by numeric and followed by
#'      alphabetical character
#'    * All equivalents of ambiguous synonyms for time units are explicitly
#'      renamed to their respective unit (e.g., 'dph' (days post hatching) -> 'day')
#'    * unreported/missing units are renamed 'unit'
#' 
#' It is your own responsibility to check if the sanitising steps are appropriate for
#' your analyses.
#' @param x A vector of `character` strings. It expects fields as commonly returned
#' from the ECOTOX database.
#' @param type The type of unit that can help the sanitation process. See the 'usage'
#' section for available options. These options are linked to the different unit tables
#' in the database (see `vignette("ecotox-schema")`). It can help to interpret ambiguous
#' units correctly. For instance, 'dpm' can both mean 'disintegrations per minute'
#' (`type = "concentration"`) and 'days post-moult' (`type = "duration"`).
#' @param ... Ignored.
#' @param warn If set to `FALSE` warnings while converting text to units are suppressed.
#' @returns A vector of `?units::unit` class objects with the same length as `x`.
#' @author Pepijn de Vries
#' @examples
#' ## Try parsing a random set of units from the database:
#' c("ppm-d", "ml/2.5 cm eu", "fl oz/10 gal/1k sqft", "kg/100 L",
#'   "mopm", "ng/kg", "ug", "AI ng/g", "PH", "pm", "uM/cm3", "1e-4 mM",
#'   "degree", "fs", "mg/TI", "RR", "ug/g org/d", "1e+4 IU/TI", "pg/mg TE",
#'   "pmol/mg", "1e-9/l", "no >15 cm", "umol/mg pro", "cc/org/wk", "PIg/L",
#'   "ug/100 ul/org", "ae mg/kg diet/d", "umol/mg/h", "cmol/kg d soil",
#'   "ug/L diet", "kg/100 kg sd", "1e+6 cells", "ul diet", "S", "mmol/h/g TI",
#'   "g/70 d", "vg", "ng/200 mg diet", "uS/cm2", "AI ml/ha", "AI pt/acre",
#'   "mg P/h/g TI", "no/m", "kg/ton sd", "ug/g wet wt", "AI mg/2 L diet",
#'   "nmol/TI", "umol/g wet wt", "PSU", "Wijs number") |>
#'   as_unit_ecotox(warn = FALSE)
#' 
#' ## Adding the type of measurement can affect interpretation:
#' as_unit_ecotox(c("C", "K"), type = "concentration")
#' as_unit_ecotox(c("C", "K"), type = "media")
#' @family ecotox-sanitisers
#' @export
as_unit_ecotox <- function(
    x,
    type = c("concentration", "duration", "length", "media", "application", "size", "weight", "unknown"),
    ..., warn = TRUE) {
  if (inherits(x, "mixed_units")) return(x)
  if (typeof(x) != "character") stop(
    paste("`as_unit_ecotox` should only convert `characters`.",
          "I got", typeof(x), "instead."))
  
  type <- rlang::arg_match(type)
  # Declare variables to pass CRAN checks
  .data <- NULL
  
  fun <- if (warn) \(x) x else suppressWarnings
  
  x <-
    dplyr::tibble(code = x) |>
    dplyr::mutate(
      ## trim leading and trailing white spaces
      code          = trimws(.data$code),
      
      ## remove square brackets and commas
      code          = gsub("\\[|\\]|,", "", .data$code),
      
      ## remove known prefixes
      code          = gsub("(?<=^|[ ])(ae|AE|ai|AI|fl|litter) ", "",
                           .data$code, perl = TRUE),
      
      ## double spaces to single spaces
      code          = gsub("  ", " ", .data$code),
      
      ## remove spaces around multiply symbol
      code          = gsub(" ?[*] ?", "*", .data$code),

      ## interpret 'for' as multiplication
      code          = gsub(" for ", "*", .data$code),
      
      ## minus preceded by alphabetical (not 'e') and followed by numeric
      ## should be an exponent
      code          = gsub("(?<=[a-d|f-z|A-D|F-Z])-(?=[0-9])", "^-", .data$code, perl = TRUE),
      
      ## standardise scientific notation in numeric component
      code          = gsub(" ?x10x(?=[0-9])", "1e", .data$code, perl = TRUE),
      code          = gsub("10[xX]", "1e", .data$code),
      
      ## numeric followed by 'k' is a thousandfold
      code          = gsub(sprintf("(?<=[0-9])k%s", .ut_sep2), "000", .data$code, perl = TRUE),
      
      ## replace ambiguous patterns with more explicit strings
      code          = {
        result <- .data$code
        for (i in seq_len(nrow(.ut_explicit))) {
          result <- .replace_ut_frag(
            .ut_explicit$pattern[[i]],
            .ut_explicit$replacement[[i]],
            result)
        }
        result
      },

      ## remove various annotations (space followed by one of 'known' annotations)
      code          = gsub(sprintf(" (%s)%s", paste(.ut_suffix, collapse = "|"), .ut_sep2), "",
                           .data$code, perl = TRUE),
      
      ## make counts explicit
      code          = .replace_ut_frag(sprintf("%s", paste(.ut_counts, collapse = "|")),
                                       "counts", .data$code),
      
      ## percentage preceded by decade of grams is in the mass unit per decilitre
      ## which is more specific
      code          = gsub("g([ ]?)%( w/v)?", "g/dL", .data$code),
      
      ## percentage concentration mass over volume is more explicitly
      ## expressed as hectogram per millilitre
      code          = gsub("^%([ ]?)(m|w)(:|/)v$", "g/dL", .data$code),
      
      ## percentage concentration volume over volume is more explicitly
      ## expressed as litre per hectolitre
      code          = gsub("^%([ ]?)v(:|/)v$", "L/hL", .data$code),
      
      ## percentage concentration mass over mass is more explicitly
      ## expressed as grams per hectograms
      code          = gsub("^%([ ]?)(m|w)(:|/)(m|w)$", "g/hg", .data$code),
      
      ## percentage concentration volume over mass is more explicitly
      ## expressed as grams per decalitre
      code          = gsub("^%([ ]?)v(:|/)(m|w)$", "g/dL", .data$code),

      ## CI, mCI and uCI are Curies (milli and micro)
      code          = gsub(sprintf("(?<=^(u|m|c)|%s(u|m|c))CI%s", .ut_sep0, .ut_sep2),
                           "Curies", .data$code, perl = TRUE),
      
      ## mM, uM, M is mmol/L, umol/L and mol/L respectively
      code          = gsub(sprintf("(?<=^(u|m)|%s(u|m|c))M%s", .ut_sep0, .ut_sep2),
                      "mol/L", .data$code, perl = TRUE),
      code          = .replace_ut_frag("M", "mol/L", .data$code),
      
      ## in 'mol+', the '+' is just an annotation of positive ions
      code          = gsub("mol+", "mol", .data$code, fixed = TRUE),

      ## Da (Dalton) is same as unified atomic mass (u)
      code          = gsub(sprintf("((?<=k)?)Da%s", .ut_sep2), "u", .data$code, perl = TRUE),
      
      ## mho is same as Siemens
      code          = gsub("(mho(s?))", "Siemens", .data$code, perl = TRUE),
      
      ## 'parts per ...' followed by minus, should be product (i.e, ppt-h should be ppt*h)
      code          = gsub("(?<=pp(h|t|m|b))-", "*", .data$code, perl = TRUE),
      ## make ppt and ppm more explicit where possible
      ## ppmw/soil vol === ppm w/v === ppmv
      code          = gsub("(ppmw/soil vol)|(ppm w/v)", "mg/L", .data$code),
      code          = gsub("ppt v/v", "mL/L", .data$code),
      code          = gsub("ppt w/v", "g/L", .data$code),
      ## ppmw === ppm w/w === ppm dw
      code          = gsub("(ppm dw)|(ppm w/w)|(ppmw)", "ug/g", .data$code),
      code          = gsub("(ppt w/w)|(pptw)", "mg/g", .data$code),
      ## In case of no annotation assume weight over volume
      code          = gsub("^ppb$", "ug/L", .data$code),
      code          = gsub("^ppm$", "mg/L", .data$code),
      code          = gsub("^ppt$", "g/L", .data$code),
      code          = gsub("^pph$", "g/dL", .data$code),
      
      ## 'type' specific sanitation steps
      code          = if (type == "concentration") {
        
        result <- .data$code
        ## in case of type 'concentration' 'K' stands for Karmen units
        result <- .replace_ut_frag("K", "Karmen", result)
        ## in case of concentration dpm is disintegrations (counts) per minute
        result <- .replace_ut_frag("(c|d)pm", "counts/min", result)
        ## in case of 'percent' and type 'concentration' it is assumed to
        ## be weight over volume
        result <- gsub("^[%]$", "dg/L", result)
        result
        
      } else          if (type == "media") {
        
        result <- .data$code
        ## in case of type 'media' 'C' stands for Celsius  
        result <- .replace_ut_frag("C", "Celsius", result)
        ## in case of type 'media' 'K' stands for Kelvin  
        result <- .replace_ut_frag("K", "Kelvin", result)
        result
        
      } else .data$code,
      
      code          = {
        ## scientific notation is not always parsed correctly by units package.
        ## In fact it's not even supported:
        ## https://github.com/r-quantities/units/issues/383#issuecomment-2703308481
        ## Therefore, the scientific notation is replaced by decimal notation where
        ## possible (abs(log10(number)) < 9)
        num <-
          stringr::str_extract_all(.data$code, "\\d+[.]?\\d*e[-|+]?\\d+") |>
          lapply(as.numeric) |>
          lapply(\(y) {
            if (length(y) == 0) return("")
            format(y, scientific = abs(log10(y)) >= 9)
          }) |>
          unlist()
        stringr::str_replace_all(.data$code, "\\d+[.]?\\d*e[-|+]?\\d+", num)
      },
      
      ## remove space if preceded by numeric and followed by alphabetical character
      code          = gsub("(?<=[0-9]) (?=[a-z|A-Z])", "", .data$code, perl = TRUE),
      
      ## rename all equivalents of 'seconds' to 'second'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_second, collapse = "|"), .ut_sep2), "second",
        .data$code, perl = TRUE),

      ## rotations per minute
      code          = .replace_ut_frag("rpm", "counts/min", .data$code),
      
      ## rename all equivalents of 'minutes' to 'minute'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_minute, collapse = "|"), .ut_sep2), "minute",
        .data$code, perl = TRUE),

      ## rename all equivalents of 'hours' to 'hour'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_hour, collapse = "|"), .ut_sep2), "hour",
        .data$code, perl = TRUE),
      
      ## rename all equivalents of 'day' to 'day'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_day, collapse = "|"), .ut_sep2), "day",
        .data$code, perl = TRUE),
      
      ## rename all equivalents of 'weeks' to 'week'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_week, collapse = "|"), .ut_sep2), "week",
        .data$code, perl = TRUE),
      
      ## rename all equivalents of 'months' to 'month'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_month, collapse = "|"), .ut_sep2), "month",
        .data$code, perl = TRUE),
      
      ## rename all equivalents of 'years' to 'year'
      code          = gsub(
        sprintf("%s(%s)%s", .ut_sep1, paste(.ut_year, collapse = "|"), .ut_sep2), "year",
        .data$code, perl = TRUE),
      
      ## Use 'unit' by default as arbitrary unit
      code          = gsub("^(|--|NR|NC)$", "unit", .data$code)
    )
  
  result <-
    x |>
    dplyr::distinct() |>
    dplyr::mutate(
      unit = {
        y <-
          lapply(
            .data$code,
            \(y) {
              tryCatch({
                units::mixed_units(1, y) |> fun()
              }, error = function(e) units::mixed_units(1, "unit"))
            }) |>
          fun()
        do.call(c, y)
      }
    )
  
  dplyr::left_join(x, result, "code") |>
    dplyr::pull("unit")
}

#' Convert mixed units to a specific unit
#' 
#' Converts a list of mixed units to a specific unit, using the `units`
#' package.
#' @param x A mixed units object (`units::mixed_units()`) to be converted
#' to the `target_unit`
#' @param target_unit A `character` string representing the target
#' unit
#' @returns Returns a units object (`?units::units`). Values with units
#' that cannot be converted to the `target_unit` is returned as `NA`.
#' @examples
#' mishmash <- as_unit_ecotox(c("mg/L", "ppt w/v", "% w/v", "mmol/L"))
#' 
#' ## Note that 'mmol/L' cannot be converted to 'ug/L'
#' ## without a molar mass. It is returned as `NA`
#' mixed_to_single_unit(mishmash, "ug/L")
#' 
#' mishmash <- as_unit_ecotox(c("h", "sec", "mi", "dph"))
#' 
#' mixed_to_single_unit(mishmash, "h")
#' @author Pepijn de Vries
#' @family ecotox-sanitisers
#' @export
mixed_to_single_unit <- function(x, target_unit) {
  result <-
    lapply(x, \(y)
           tryCatch(units::set_units(units::mixed_units(y), target_unit)[[1]],
                    error = function(e) units::as_units(NA, target_unit))
    )
  do.call(c, result)
}
Any scripts or data that you put into this service are public.
ECOTOXr documentation built on April 12, 2025, 1:27 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ECOTOXr
Download and Extract Data from US EPA's ECOTOX Database

R/process_unit.r
In ECOTOXr: Download and Extract Data from US EPA's ECOTOX Database

Defines functions mixed_to_single_unit as_unit_ecotox process_ecotox_units .field_to_unit_type .replace_ut_frag

Documented in as_unit_ecotox mixed_to_single_unit process_ecotox_units

Try the ECOTOXr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ECOTOXr Download and Extract Data from US EPA's ECOTOX Database

R/process_unit.r In ECOTOXr: Download and Extract Data from US EPA's ECOTOX Database

Defines functions mixed_to_single_unit as_unit_ecotox process_ecotox_units .field_to_unit_type .replace_ut_frag

Documented in as_unit_ecotox mixed_to_single_unit process_ecotox_units

Try the ECOTOXr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ECOTOXr
Download and Extract Data from US EPA's ECOTOX Database

R/process_unit.r
In ECOTOXr: Download and Extract Data from US EPA's ECOTOX Database