R/phenodata.R

Defines functions extractPhenoFromHTML .doExtract .textAtIdx

#' @param x An R object to coerce to numeric.
#' @value Returns the value of `as.numeric(x)` if the call to as.numeric does not throw an error, otherwise returns `NA`.
.numericOrNA < function(x)
  tryCatch(as.numeric(x), error = NA)

#' @param x A single "xml_node" object holding a single HTML <tr> row.
#' @param i The single integer specifying the <td> cell index from argument `x` to extract.
#' @value Character
.textAtIdx <- function(x, i, j)
  xml_text(xml_find_one(x, sprintf('//td[%s]', i, j)))

#' @param res A single "xml_node" object holding the scraped Kannapedia HTML for a single RSP identifier (single strain).
#' @details Signature for the `res` argument: `assert(inherits(res, 'html_node') | inherits(res, 'xml_node')))`
.doExtract <- function(res = NULL) {
  if (is.null(res))
    return(
      list(
        chemprofile = c(
          cannabinoids = NA,
          terpenoids   = NA
        ),
        pctheterozy = NA,
        gendistance = NA
      )
    )

  .getP <- function(block)
    .numericOrNA(xml_text(block))

  .getG <- function(block)
    lapply(
      xml_find_all(block, '//tbody/tr[position()>1]'),
      function(tr) {
        list(
          position = .textAtTd(tr, 1) %>% .numericOrNA,
          distance = .textAtTd(tr, 3) %>% .numericOrNA,
          relative = .textAtTd(tr, 2) %>% .numericOrNA
        )
      }
    )

  .getC <- function(block)
    list(
      'THC/A' = xml_find_one(block, '//p/strong[2]/em') %>% .numericOrNA,
      'THCV/A' = xml_find_one(block, '//p/strong[4]/em') %>% .numericOrNA,
      'CBD/A' = xml_find_one(block, '//p/strong[3]/em') %>% .numericOrNA,
      'CBN/A' = xml_find_one(block, '//p/strong[7]/em') %>% .numericOrNA,
      'CBC/A' = xml_find_one(block, '//p/strong[5]/em') %>% .numericOrNA,
      'CBG/A' = xml_find_one(block, '//p/strong[6]/em') %>% .numericOrNA
    )

  .getT <- function(block)
    list(
      'alpha-Bisabolol' = xml_find_one(block,'//strong[1]') %>% .numericOrNA,
      'Borneol' = xml_find_one(block,'//strong[2]') %>% .numericOrNA,
      'Camphene' = xml_find_one(block,'//strong[3]') %>% .numericOrNA,
      'Carene' = xml_find_one(block,'//strong[4]') %>% .numericOrNA,
      'Caryophyllene oxide' = xml_find_one(block,'//strong[5]') %>% .numericOrNA,
      'beta-Carophyllene' = xml_find_one(block,'//strong[6]') %>% .numericOrNA,
      'Fenchol' = xml_find_one(block,'//strong[7]') %>% .numericOrNA,
      'Geraniol' = xml_find_one(block,'//strong[8]') %>% .numericOrNA,
      'alpha-Humulene' = xml_find_one(block,'//strong[9]') %>% .numericOrNA,
      'Limonene' = xml_find_one(block,'//strong[10]') %>% .numericOrNA,
      'Linalool' = xml_find_one(block,'//strong[11]') %>% .numericOrNA,
      'Myrcene' = xml_find_one(block,'//strong[1]') %>% .numericOrNA,
      'alpha-Phellandrene' = xml_find_one(block,'//strong[2]') %>% .numericOrNA,
      'Terpinolene' = xml_find_one(block,'//strong[3]') %>% .numericOrNA,
      'alpha-Terpineol' = xml_find_one(block,'//strong[4]') %>% .numericOrNA,
      'alpha-Terpinene' = xml_find_one(block,'//strong[5]') %>% .numericOrNA,
      'gamma-Terpinene' = xml_find_one(block,'//strong[6]') %>% .numericOrNA,
      'Total Nerolidol' = xml_find_one(block,'//strong[7]') %>% .numericOrNA,
      'Total Ocimene' = xml_find_one(block,'//strong[8]') %>% .numericOrNA,
      'alpha-Pinene' = xml_find_one(block,'//strong[9]') %>% .numericOrNA,
      'beta-Pinene' = xml_find_one(block,'//strong[10]') %>% .numericOrNA
    )

  list(
    chemprofile = c(
      terpenoids = c(
        xml_find_first(content(res, 'parsed'), '//*[@id="x-section-2"]/div[1]/div[1]/div[2]/ul/li[1]') %>% .getT,
        xml_find_first(content(res, 'parsed'), '//*[@id="x-section-2"]/div[1]/div[1]/div[2]/ul/li[2]') %>% .getT
      ) %>% merge,
      cannabinoids = xml_find_first(content(res, 'parsed'), '//*[@id="x-section-2"]/div[1]/div[1]/div[1]') %>% .getC,
    ),
    pctheterozy = xml_find_first(content(res, 'parsed'), '//*[@id="x-section-2"]/div[1]/div[2]/div/strong[1]/em') %>% .getP,
    gendistance = xml_find_first(content(res, 'parsed'), '//*[@id="x-section-2"]/div[2]/table') %>% .getG
  )
}

#' @example `extractMetabolitesFromHTML(read_html(content("<...>", "parsed"), as_html = T))`
#' @export
extractPhenoFromHTML <-
  function(html, callback = function(lab, val) c(label=lab, value=val), .doExtract)
{
  .check <- function(htm)
    list(
      isHttr = (isHttr <- inherits(htm, 'response')),
      isXml2 = (isXml2 <- inherits(htm, 'html_node') | inherits(htm, 'xml_node')),

      isParsed = (isParsed <- any(isHttr, isXml2)),
      isCharv = (isCharv <- !isParsed && inherits(htm, 'character')),

      isFile = (isFile <- inherits(htm, 'file') | inherits(htm, 'connection')),
      isPath = (isPath <- isCharv & !isFile && file.exists(htm)),

      isTypeError = (isTypeError <- !any(isParsed, isFile, isCharv))
    )

  res <- .doExtract(NULL) # Initialize with NA's

  if (.check(html)['isTypeError'])
    stop('Argument "html" must inherit from one of the following classes: "response", "html_node", "xml_node", "connection", "file", or "character".')

  if (.check(html)['isCharv'])
    html <- read_html(html, as_html = T)
  if (.check(html)['isPath'])
    html <- file(html)
  if (.check(html)['isFile'])
    html <- read_html(html, as_html = T)
  if (.check(html)['isParsed'])
    if (.check(html)['isHttr'])
      res <- content(html, 'parsed')
    if (.check(html)['isXml2'])
      res <- read_html(html, as_html = T)

  .doExtract(res)
}
Indicai-dev/kannarip documentation built on Sept. 2, 2020, 12:36 p.m.