R/dataPatch.R

Defines functions aHref getRandomNumber getPageCached stringInteractorsCounter keggPathWayCounter getCompoundList xmlUniProt fdaLabel downloadEMAFile readReport ema pubMed statusSourceHTML consolidateColumns checkDataCoverage patch main

##
## dataPatch.R: supply missing data items for clue.io compounds -----------
##

## Usage: call the main() from command line:
## R -e 'source("R/dataPatch.R"); main()'

##
## Libraries --------------------------------------------------------------
##
library(assertthat)
library(dplyr)
library(glue)
library(httr)
library(readr)
library(rvest)
library(stringr)
library(xml2)

set.seed(739)
options(width = 160)

##
## Settings of global variables -------------------------------------------
##
SLEEP_TIME <- 30 # waiting time between two HTTP requests in seconds
INGREDIENT_FILTER <- FALSE # Is FDA Label API request strict or not?
OUTPUT <- "OUTPUT"
CACHE <- glue::glue("{OUTPUT}/DATAPATH_CACHE")
CLUE.INPUT <- glue::glue("{OUTPUT}/clue.rds")
# CLUE.PATCHED.OUTPUT <- glue::glue("{OUTPUT}/clue_patched.tsv")
CLUE.PATCHED.OUTPUT <- glue::glue("{OUTPUT}/clue_patched.rds")
TARGET_LIST.RDS <- glue::glue("{OUTPUT}/targetList.rds")

TOOL.NAME <- "https://github.com/cycle20/scancer/"

PUBMED.BASE <- "https://pubmed.ncbi.nlm.nih.gov/"
PUBMED.SEARCH <- paste0(
  PUBMED.BASE,
  "?term={compound}&size=200", # "compound" as glue::glue variable
  "&filter=pubt.clinicaltrial","&filter=pubt.meta-analysis",
  "&filter=pubt.randomizedcontrolledtrial",
  "&filter=pubt.review",
  "&filter=pubt.systematicreview"
)
PUBMED.RESULT.XPATH <- paste0(
  '/html/body/main//section[@class="search-results-list"]',
  '//article[position() < 4]//a'
)
# maximum number of displayed PubMed links per compound
PUBMED.DISPLAY.MAX_LINKS <- 3

UNIPROT.HTML.TEMPL = "https://www.uniprot.org/uniprot/{id}"
UNIPROT.XML.TEMPL = "https://www.uniprot.org/uniprot/{id}.xml"

EMA.XLSX.FILE <- "Medicines_output_european_public_assessment_reports.xlsx"
EMA.PRODUCT.INFORMATION.URL <- paste0(
  'https://www.ema.europa.eu/en/documents/product-information',
  '/{urlName}-epar-product-information_en.pdf'
)

##
## Functions --------------------------------------------------------------
##

#' Main function
#'
#' Load data supply missing values.
#' In some complex cases it do some HTML pre-formatting to
#' support a simple HTML page rendering solution.
#'
#' @return Patched data.frame
main <- function() {
  ## read curated input used by clue.R
  targetList <- readRDS(TARGET_LIST.RDS)

  clueTable <- readRDS(CLUE.INPUT) %>%
    ## NOTE: this filter drops some not well-curated compounds
    filter(!is.na(source))

  checkDataCoverage(clueTable)

  clueTable <- targetList %>%
    left_join(clueTable) %>%
    rowwise() %>%
    mutate(has_data = (!is.na(pert_iname) && !is.na(UNIPROT_KB_ID))) %>%
    filter(has_data == TRUE)

  result <- patch(clueTable)

  ## save tibble as RDS since write_tsv is not an obvious way
  print(result)
  saveRDS(result, file = CLUE.PATCHED.OUTPUT)

  # TODO: should be another logic: checkDataCoverage(result)

  warnings()
  return(result)
}


#' "Patch" data gaps in CLUE table
#'
#' @param clueTable
#'
#' @return Invisible NULL
patch <- function(clueTable) {

  if (nrow(clueTable) == 0) {
    stop("There are no rows in the input table")
  }

  ## FDA searches
  clueTable <- fdaLabel(clueTable)
  ## PubMed searches
  clueTable <- pubMed(clueTable)
  ## EMA searches
  clueTable <- ema(clueTable)
  ## Append UniProt details
  clueTable <- xmlUniProt(clueTable)
  ## Add count of KEGG pathways and STRING interactors per target
  clueTable <- keggPathWayCounter(clueTable)
  clueTable <- stringInteractorsCounter(clueTable)

  ## Adjustment of data of columns and HTML fragments
  clueTable <- consolidateColumns(clueTable)

  ## return updated tables
  return(clueTable)
}


#' Check gaps in our data set
#'
#' @param clueTable
#'
#' @return Invisible NULL
checkDataCoverage <- function(clueTable) {
  separator <- paste(rep("#", 75), collapse = "")
  print(glue::glue("\n\n{separator}"))
  print(glue::glue("{separator}", "  !!! START OF DATA INTEGRITY TEST !!!"))
  print(glue::glue("{separator}\n\n"))

  ## internal helper function
  checkTable <- function(table, msg, stop = FALSE) {
   n <- nrow(table)
   prefix <- glue::glue("\n\n>>>>>>>> {msg}: {n} ...")
   if (nrow(table) > 0) {
     print(glue::glue("{prefix} IS NOT OK!"))
     print(table)
     if (stop) stop("Unexpected data state")
   } else
     print(glue::glue("{prefix} IS OK!"))
  }

  ## FDA Orange issue
  distinctTable <- clueTable %>%
    select(
      pert_iname,
      # moa,
      final_status,
      status_source,
      orange_book
    ) %>%
    distinct() %>%
    rowwise() %>%
    filter(grepl("FDA Orange", status_source))
  checkTable(distinctTable, "FDA Orange")

  ## FDA Orange issue V2
  distinctTable <- clueTable %>%
    select(
      pert_iname,
      final_status,
      status_source,
      orange_book
    ) %>%
    distinct() %>%
    rowwise() %>%
    filter(!is.na(orange_book) && is.na(status_source))
  checkTable(distinctTable, "FDA.V2 orange_book has value")

  ## FDA Launched check
  distinctTable <- clueTable %>%
    select(
      pert_iname,
      final_status,
      status_source,
      orange_book
    ) %>%
    distinct() %>%
    rowwise() %>%
    filter(final_status == "Launched" &&
      is.na(orange_book) && is.na(status_source))
  checkTable(distinctTable, "FDA Launched check")

  ## PubChem/ChEMBL check
  distinctTable <- clueTable %>%
    select(
      pert_iname,
      final_status,
      pubchem_cid,
      chembl_id,
      inchi_key,
      pert_id,
      ttd_id,
      drugbank_id,
      source,
      status_source
    ) %>%
    distinct() %>%
    rowwise() %>%
    filter(is.na(pubchem_cid) && is.na(chembl_id))
  checkTable(distinctTable, "PubChem/ChEMBL check")

  distinctTable <- clueTable %>%
    select(final_status, status_source) %>%
    distinct() %>%
    filter(final_status == "Preclinical" && !is.na(status_source))
  checkTable(distinctTable, "Preclinical status_source", stop = TRUE)

##  print("Foreced quit")
##  quit(save = "no")


  print(glue::glue("{separator}"))
  print(glue::glue("{separator}", "  !!! END OF DATA INTEGRITY TEST !!!"))
  print(glue::glue("{separator}"))

  return(invisible(NULL))
}


#' Cross-checking columns
#'
#' @param clueTable Input dataframe.
#'
#' @return data.frame updated based on column-wide decisions.
consolidateColumns <- function(clueTable) {
  ##
  ## TODO: Where should be constructed the HTML code?
  ##       Template file would be the best place.
  ##


  ## FUNC: to compose PubMed link
  pubMedLink <- function(PMID) {
    link <- paste0(PUBMED.BASE, PMID)
    return(aHref(link, glue::glue("PubMed {PMID}")))
  }
  ## FUNC: returns an HTML string of PubMed links
  pubMedLinks <- function(PMIDs, compound) {

    title <- "<strong>From PubMed:</strong>"
    if(all(is.na(PMIDs))) {
      searchLinkHTML <- aHref(link = glue::glue(PUBMED.SEARCH),
        titleText = "No search result"
      )
      return(paste(c(title, searchLinkHTML), collapse = "<br />"))
    } else {
      assertthat::assert_that(is.vector(PMIDs))

      PMIDs <- head(PMIDs, n = PUBMED.DISPLAY.MAX_LINKS)
      htmlLinks <- paste(sapply(PMIDs, pubMedLink), collapse = "<br />")
      return(paste(c(title, htmlLinks), collapse = "<br />"))
    }
  }

  ## FUNC: to compose EMA and append link to "status_source"
  appendEMALinks <- function(status_source = "", emaProduct, emaLink) {
    if(is.na(emaLink)) {
      return(status_source)
    } else {
      status_source <- paste(
        c(status_source, "<strong>From EMA:</strong>"),
        collapse = "<br />"
      )

      # Example:
      # https://www.ema.europa.eu/en/medicines/human/EPAR/glyxambi
      # https://www.ema.europa.eu/en/documents/overview/glyxambi-epar-summary-public_en.pdf
      # or
      # https://www.ema.europa.eu/en/documents/product-information/glyxambi-epar-product-information_en.pdf
      urlName <- stringr::str_remove(emaLink, '^https:.+/')
      emaLink <- glue::glue(EMA.PRODUCT.INFORMATION.URL)
      status_source <- paste(
        c(
          status_source,
          aHref(emaLink, glue::glue('{emaProduct} (PDF)'))
        ),
        collapse = "<br />"
      )

      return(status_source)
    }
  }

  ## FUNC: to compose FDA and append links to "status_source"
  appendFDALinks <- function(status_source = "", fdaLabelDetails) {
    if (is.na(fdaLabelDetails$pert_iname)) {
      return(status_source)
    } else {
      ## old value + title of FDA "section":
      status_source <- paste(
        c(status_source, "<strong>FDA Labels:</strong>"),
        collapse = "<br />"
      )

      ## glue variable accessed from dplyr environment: "setId"
      ## TODO: https://nctr-crs.fda.gov/fdalabel...spl-doc?hl=PATTERN_TO_HIGHLIGHT
      labelURLTemplate <-
        "https://nctr-crs.fda.gov/fdalabel/services/spl/set-ids/{setId}/spl-doc"
      ## glue variable accessed from dplyr environment: "productNames"
      anchorTemplate <- '<a href="{labelURL}" target="_blank">{productNames}</a>'

      ## concatenate labels as HTML URL list
      htmlOfURLs <- fdaLabelDetails$productsTable %>%
        mutate(
          labelURL = glue::glue(labelURLTemplate),
          htmlURL = glue::glue(anchorTemplate)
        ) %>%
        select(labelURL, htmlURL)

      resultHTML <- htmlOfURLs %>% pull(htmlURL)
      resultHTML <- paste(resultHTML, collapse = "<br />")

      status_source <- paste(c(status_source, resultHTML), collapse = "<br />")
      return(status_source)
    }
  }

  ## update "status_source" by HTML representation of various sources
  clueTable <- clueTable %>%
    rowwise() %>%
    mutate(
      pubMedPreClinicalLinks = pubMedLinks(pubMedPreClinicalLinks, pert_iname),
      fdaSearchResults = appendFDALinks(pubMedPreClinicalLinks,
                                        fdaLabelDetails = fdaSearchResults),
      emaLinks = appendEMALinks(fdaSearchResults, emaProduct, emaLink = emaLinks)
    ) %>%
    rowwise() %>%
    mutate(clueSource = glue::glue(
      "<div class='clue'><strong>CLUE.IO</strong><br/>",
      "{statusSourceHTML(status_source, pert_iname)}</div>"
    )) %>%
    mutate(
      status_source = paste(c(emaLinks, clueSource), collapse = "<br />")
    )

  return(clueTable)
}


#' Represent sources as hyperlinks
#'
#' @param statusSource character if it is an URL, it points
#' probably to ClinicalTrials; but other URLs and pure texts
#' can be expected here as well. This function verifies the source
#' value and transform it the most appropriate HTML string.
#' @param pert_iname name of the perturbagen
#'
#' @return HTML string
statusSourceHTML <- function(statusSource, pert_iname) {
  if (is.na(pert_iname) || is.null(pert_iname)) {
    return(pert_iname)
  } else if(is.na(statusSource) || is.null(statusSource)) {
    return("No status source")
  } else if (stringr::str_starts(statusSource, "<a href=")) {
    ## already converted HTML
    return(statusSource)
  }

  htmlText <- ""
  label <- if(stringr::str_starts(statusSource,
    pattern = "https?://.*clinicaltrials.gov/.+NCT[0-9]+")) {
    "ClinicalTrials"
  } else if (stringr::str_starts(statusSource,
    "https?://.*ncbi.*gov/pubmed")) {
    "PubMed"
  } else if (stringr::str_starts(statusSource, "https?://.+fda.gov/")) {
    "FDA"
  } else if (stringr::str_starts(statusSource,
    "https?://.*dailymed.*.gov/")) {
    "DailyMed"
  } else if (stringr::str_starts(statusSource, "https?://.*wikipedia.org/")) {
    "Wikipedia"
  } else if (stringr::str_starts(statusSource, "https?://www.drugs.com/")) {
    "drugs.com"
  } else if (stringr::str_starts(statusSource, "https?://.*springer.com/")) {
    "Springer"
  } else if (stringr::str_starts(statusSource, "https?://docslide.*/")) {
    "docslide"
  } else if (stringr::str_starts(statusSource, "https://guidebook.com/")) {
    "guidebook"
  } else if (stringr::str_starts(statusSource, "http")) {
    # default URL text
    "Unexpected Source"
  } else {
    ## plain text as is
    html <- statusSource
    return(html)
  }
  htmlText <- paste0(htmlText, aHref(link = statusSource, titleText = label))

  return(htmlText)
}


#' Extract and supply PMIDs/links
#'
#' @param clueTable
#'
#' @return
pubMed <- function(clueTable) {

  ## pubMed search request and extract results
  pubMedSearch <- function(compound) {

    ## download page or read it from cache
    searchURL <- glue::glue(PUBMED.SEARCH)

    result <- getPageCached(searchURL, sleepTime = SLEEP_TIME)

    ## XPath expression to access ID list embedded into HTML code
    xpath <- paste0(
      'string(/html/head/meta[@name = "log_displayeduids"]/@content)'
    )
    ## get article IDs: single string separated by "," and
    ## split the ID list
    articleIds <- result$document %>%
      rvest::html_element(xpath = xpath) %>%
      stringr::str_split(pattern = ",") %>%
      unlist()

    articleIds <- if_else(articleIds[1] == "",
      list(NULL), list(articleIds)) %>%
      unlist()

    return(articleIds)
  }

  ## get link for a compound only once: list of unique compound names
  compoundList <- getCompoundList(clueTable)
  ## apply search on each compound
  compoundList <- sapply(compoundList, pubMedSearch, simplify = FALSE)

  addPubMedData <- function(.data) {
    .data <- .data %>%
      mutate(pubMedPreClinicalLinks = list(compoundList[[pert_iname]])) %>%
      mutate(PubMedCounter = length(unlist(pubMedPreClinicalLinks)))
    return(.data)
  }

  clueTable <- clueTable %>% addPubMedData()

  # TODO: PMC? And embargoed articles? https://www.ncbi.nlm.nih.gov/pmc
  # TODO: https://drugs.ncats.io/substances?facet=Pharmacology%2FInhibitor
  # TODO: https://drugs.ncats.io/drug/MRK240IY2L

  return(clueTable)
}


##
## Labels for Launched Drugs/Compounds ------------------------------------
##

#' Supply details from EMA website
#'
#' @param clueTable Input dataframe with compounds (pert_iname column).
#'
#' @return data.frame patched by EMA links.
ema <- function(clueTable) {

  reportTable <- readReport(EMA.XLSX.FILE)

  ## Compare
  compare <- function(compound1, compound2) {
    pattern <- "[()+]"
    compound1 <- stringr::str_remove_all(compound1, pattern)
    compound2 <- stringr::regex(
      stringr::str_remove_all(compound2, pattern),
      ignore_case = TRUE
    )
    return(stringr::str_detect(compound1, compound2))
  }

  ## Get results of query on EMA search page
  emaSearch <- function(compound) {

    print(glue::glue("EMA SEARCH: compound: {compound}"))
    reportTable <- reportTable %>%
      dplyr::filter(compare(`Active substance`, compound))

    if (nrow(reportTable) == 0) {
      return(list(
        url = NA_character_,
        productName = NA_character_
      ))
    }

    # pick the first URL
    return(list(
      url = reportTable$URL[1],
      productName = reportTable$`Medicine name`[1]
    ))
  }

  ## get link for a compound only once: list of unique compound names
  compoundList <- getCompoundList(clueTable)
  ## apply search on each compound
  compoundList <- sapply(compoundList, emaSearch, simplify = FALSE)

  ## update table with EMA links
  clueTable <- clueTable %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      emaLinks = compoundList[[pert_iname]]$url,
      emaProduct = compoundList[[pert_iname]]$productName
    )

  return(clueTable)
}


readReport <- function(fileName) {
  reportFile <- glue::glue("{OUTPUT}/{fileName}")
  downloadEMAFile(fileName, destinationFile = reportFile)

  report <- readxl::read_excel(reportFile, skip = 7)
  return(report)
}

downloadEMAFile <- function(file, destinationFile = file, quiet = FALSE) {
  EMA_FILES_BASE_URL <- "https://www.ema.europa.eu/sites/default/files"
  url <- glue::glue("{EMA_FILES_BASE_URL}/{file}")
  if (!quiet) {
    print(glue::glue("Downloading {url}"))
  }
  curl::curl_download(url = url, destfile = destinationFile, mode ="wb", quiet = quiet)
}


#############################################################



fdaLabel <- function(clueTable) {
  ## url with "pert_iname" placeholder
  url = paste0("https://nctr-crs.fda.gov/fdalabel/services/spl/summaries",
               "?pert={pert_iname}&ingr=", INGREDIENT_FILTER)

  # DailyMed - NIH’s labeling search tool over 130,000 labeling documents for
  # prescription drugs (including biological products, vaccines, blood products,
  # cellular and gene therapy products), over-the-counter drugs, homeopathic
  # drugs, animal drugs, and other products.
  # FDALabel - FDA’s labeling search tool over 130,000 labeling documents.
  # FDALabel and DailyMed have the same database but have different search
  # functions and different displays of search results.

  # More resources: https://www.fda.gov/drugs/laws-acts-and-rules/prescription-drug-labeling-resources

  ## TODO: https://www.fda.gov/science-research/bioinformatics-tools/fdalabel-full-text-search-drug-product-labeling#Live%20Queries
  ## Biomarker “BRCA or BRAF”

  ## FUNC: Custom download function instead of rvest::read_html.
  ##       Passed as parameter of getPageCached function.
  ##       based on https://nctr-crs.fda.gov/fdalabel/ui/search results
  fdaDownload <- function(url) {
    if(startsWith(url, CACHE)) {
      ## if reading from cache...
      return(readr::read_file(url))
    }

    ## TODO: Ugly hack to resolve variable from the call-chain
    pert_iname <- get("pert_iname", envir = sys.frame(-2))
    if (!INGREDIENT_FILTER) {
      ## Simple search:
      ## ..............
      ## Labeling type: "Human Rx", "Human OTC"
      ## Labeling Full Text Search
      # postData <- paste0('{"criteria":[{"criteria":[',
      #   '{"sourceEntity":"document-type","documentTypeCodes":',
      #   '["34391-3","34390-5", "53404-0"]},{"sourceEntity":"spl-text",',
      #   '"textQuery":"',
      #   pert_iname,
      #   '","advanced":false}],"logOps":["a","a"]}],"logOps":[]}')
      postData <- paste0(
        '{"criteria":[{
          "criteria": [
            {
              "sourceEntity": "document-type",
              "documentTypeCodes": [
                "34391-3",
                "34390-5",
                "53404-0"
              ]
            },
            {
              "sourceEntity": "product",
              "nameType": "ANY",
              "namePatternType": "SUBSTR",
              "namePattern": "', pert_iname, '"
            }
          ],
          "logOps": ["a","a"]
        }],"logOps":[]}'
      )
    } else {
      ## Simple search: ingredient filter is ON
      ## ......................................
      postData <- paste0(
        '{"criteria":[{"criteria":[',
        '{"sourceEntity":"document-type","documentTypeCodes":',
        ## Labeling type: "Human Rx", "Human OTC", "Vaccine"
        '["34391-3","34390-5","53404-0"]},{"sourceEntity":"section",',
        '"textQuery":"',
        pert_iname,
        ## Labeling Section(s): "ACTIVE INGERDIENT"
        '","selectedLabelingType":"0","sectionTypeCode":"2-55106-9",',
        '"advanced":false}],"logOps":["a","a"]}],"logOps":[]}'
      )
    }

    ## compress by removing white spaces
    postData <- stringr::str_remove_all(postData, "[[:space:]]")
    # response <- httr::HEAD(url, body = postData, config = add_headers(
    #   "Content-Type" = "application/json; charset=UTF-8",
    #   "Accept" = "application/json"
    # ))

    ## send the POST request
    response <- httr::POST(url, body = postData, config = add_headers(
      "Content-Type" = "application/json; charset=UTF-8",
      "Accept" = "application/json"
    ))
    responseContent <- httr::content(response, as = "text", encoding = "UTF-8")

    return(responseContent)
  }

  ## FUNC: internal helper function
  getFDALabelResults <- function(pert_iname) {
    #labelTable <- jsonlite::read_json(glue::glue("{CACHE}/{pret_iname.json}"))

    ## "empty" result
    emptyList <- list(pert_iname = NA, productsTable = NA)

    ## call cached download, uses custom download function (fdaDownload) above
    downloadedResult <- getPageCached(
      glue::glue(url), ## replace pert_iname placeholder before pass it
      sleepTime = SLEEP_TIME,
      ## send and get HTTP request and result in a specialized manner
      downloadFunc = fdaDownload
    )

    ## parsing and "compressing" result set
    parsedResult <- jsonlite::fromJSON(downloadedResult$document)
    if (parsedResult$totalResultsCount == 0) {
      print(glue::glue("fdaLabel :: {pert_iname} :: results not found"))
      return(emptyList) ## return "empty" list
    }

    ## filtering
    pertNamePattern <- gsub("-", ".{,4}", pert_iname)
    products <- parsedResult$resultsArray %>%
      filter(
        !any(grepl("first aid", productNames, ignore.case = TRUE))
          && (!any(grepl(" kit", productNames, ignore.case = TRUE))
              && !any(grepl("KIT", dosageForms)))
      )

    if ((products %>% nrow()) == 0) {
      print(glue::glue("{pert_iname} :: results not found after filtering"))
      return(emptyList) ## return "empty" list
    }

    ## FUNC: Shrink found labels selecting by
    ##       max. market date of each group
    ingredientsMatchLevel <- Vectorize(function(pert_iname, ingrList) {
      ## to uppercase and split the list of ingredients name
      ingrList <- ingrList %>%
        stringr::str_to_upper() %>%
        stringr::str_split(string = ., pattern = "; *")
      ingrList <- ingrList[[1]]
      ## to uppercase the pert. name
      pert_iname <- stringr::str_to_upper(pert_iname)

      matchLevel <- if (pert_iname %in% ingrList) {
        1 # exact match
      } else if (any(stringr::str_starts(ingrList, pert_iname))) {
        2 # partial match A: starts with
      } else if (any(stringr::str_ends(ingrList, pert_iname))) {
        3 # partial match B: ends with
      } else if (any(grepl(pert_iname, ingrList))) {
        4 # partial match C: contains
      } else {
        5 # no match at all
      }
    })

    ## FUNC: split and returns actIngrUniis string
    ingrLength <- Vectorize(function(actIngrUniis) {
      actIngrUniis <- actIngrUniis %>%
        stringr::str_to_upper() %>%
        stringr::str_split(string = ., pattern = "; *")
      actIngrUniis <- actIngrUniis[[1]]

      return(length(actIngrUniis))
    })

    products <- products %>%
      mutate(matchLevel = ingredientsMatchLevel(pert_iname, actIngrNames)) %>%
      mutate(ingrLength = ingrLength(actIngrUniis)) %>%
      filter(matchLevel < 5) %>%
      group_by(matchLevel, ingrLength)

    if ((products %>% nrow()) == 0) {
      print(glue::glue("{pert_iname} :: matching ingredients not found"))
      print(glue::glue("{pert_iname} :: NOTE: there might be special matching"))
      return(emptyList) ## return "empty" list
    }

    ## probably "best" matches"
    ## TODO: needs more work: since it picks from each matchLevel,
    ##       but incremental selection would be better:
    ##       - first try to get results from level 1,
    ##       - then try level 2,...
    ##       - etc...
    products <- products %>%
      filter(marketDates == max(marketDates)) %>%
      ungroup() %>%
      arrange(matchLevel, ingrLength)

    nr <- nrow(products)
    percentage <- round((1 - nr / parsedResult$totalResultsCount) * 100, 2)
    print(glue::glue(
      "fdaLabel({pert_iname}) products counts ({nr}) shrinked by {percentage} %"
    ))

    productsTable <- (products %>% select(productNames, setId))
    return(list(
      pert_iname = pert_iname,
      productsTable = productsTable
    ))
  }

  ## get link for a compound only once: list of unique compound names
  compoundList <- getCompoundList(clueTable)
  ## apply search on each compound
  compoundList <- sapply(compoundList, getFDALabelResults, simplify = FALSE)

  ## transform status_source based on FDA results
  clueTable <- clueTable %>%
    # filter(final_status == "Launched" && !is.na(orange_book)) %>%
    mutate(fdaSearchResults = list(compoundList[[pert_iname]]))

  return(clueTable)
}


#' Supply details from UniProtKB
#'
#' @param clueTable Input dataframe with UNIPROT_KB_ID column.
#'
#' @return data.frame patched by UniProt details.
xmlUniProt <- function(clueTable) {

  ## change SLEEP_TIME locally since no restrictive robots.txt found
  ## (5 secs is still very friendly)
  SLEEP_TIME <- 5

  ## if previous request found a cached item,
  ## we can increase the request rate a bit.
  SLEEP_TIME_ON_CACHE_HIT <- 2

  #' Determine "type" GO term
  #'
  #' @param termValue GO value of a term
  #'
  #' @return "type" of the term: "molecularFunction",
  #'         "biologicalProcess" or "subCellularLocation"
  GOTermType <- function(termValue) {
    type <- if (startsWith(termValue, "F:")) {
      "molecularFunction"
    } else if (startsWith(termValue, "C:")) {
      "subCellularLocation"
    } else if (startsWith(termValue, "P:")) {
      "biologicalProcess"
    } else {
      stop(paste0("Unexpected term prefix: ", termValue))
    }
    return(type)
  }

  #' Internal helper function
  #'
  #' @param xmlResult UniProt data as XML
  #'
  #' @return Extracted and selected data items.
  unwrapXMLData <- function(xmlResult) {
    root <- xml2::xml_root(result$document)

    ## STRING and GO ids
    ## NOTE: dbReference vs dbreference ???
    xpath <- paste0(

      ## GO functional ("F:") references
      "//dbreference[@type='GO']",
        "//property[@type='term'][starts-with(@value, 'F:')]",
        "/parent::node()", ## select parent node of this GO property

      ## GO cellular components ("C:") references
      " | //dbreference[@type='GO']",
        "//property[@type='term'][starts-with(@value, 'C:')]",
        "/parent::node()", ## select parent node of this GO property

      ## GO biological process ("P:") references
      " | //dbreference[@type='GO']",
        "//property[@type='term'][starts-with(@value, 'P:')]",
        "/parent::node()", ## select parent node of this GO property

      ## STRING, Reactome or KEGG references
      " | //dbreference[@type='STRING' or @type='Reactome' or @type='KEGG']"
    )
    dbReferences <- xml2::xml_find_all(x = root, xpath = xpath)

    ## iterate on DB references
    resultList <- list(
      molecularFunction = NULL,
      subCellularLocation = NULL,
      Reactome = list()
    )
    for (dbref in dbReferences) {
      type <- xml2::xml_attr(dbref, "type")
      referenceId <- xml2::xml_attr(dbref, "id")
      if (type == "STRING") {
        ## TODO: if there are multiple ids, the last one "wins"
        resultList[["STRING"]] <- referenceId
      } else if (type == "KEGG") {
        resultList[["KEGG"]] <- referenceId
      } else if (type == "Reactome") {
        pathway <-
          xml2::xml_find_first(dbref, ".//property[@type='pathway name']") %>%
          xml2::xml_attr(attr = "value")
        resultList$Reactome[[referenceId]] <- pathway
      } else if (type == "GO") {
        ## get value of the term
        termValue <-
          xml2::xml_find_first(dbref, ".//property[@type='term']") %>%
          xml2::xml_attr(attr = "value")

        ## select item name by value prefix
        termType <- GOTermType(termValue)
        ## create a new named element (as a vector)
        newItem <- c(substring(termValue, 3))
        names(newItem) <- referenceId
        ## append to the vector (to collect values for the same GO "type")
        resultList[[termType]] <- c(resultList[[termType]], newItem)
      }
    }
    return(resultList)
  }

  ## xmlUniProt "body"-----------------------------------------------------
  uniProtIdList <- clueTable %>%
    dplyr::select(UNIPROT_KB_ID) %>%
    dplyr::distinct() %>%
    dplyr::filter(!is.na(UNIPROT_KB_ID)) %>%
    dplyr::pull(UNIPROT_KB_ID)

  ##
  ## iterate on UniProtKB Id list and -------------------------------------
  ## collect/download relevant details
  ##
  filteredXMLData <- list()
  scrapeFromCache <- TRUE
  for (id in uniProtIdList) {
    ## Note: "id" referenced by the glue template below
    sleepTime <- ifelse(scrapeFromCache, SLEEP_TIME_ON_CACHE_HIT, SLEEP_TIME)
    ## load XML "page"
    result <- getPageCached(
      glue::glue(UNIPROT.XML.TEMPL), sleepTime
    )
    filteredXMLData[[id]] <- unwrapXMLData(result)
  }

  ## update the input table and return the result -------------------------
  clueTable <- clueTable %>%
    dplyr::rowwise() %>%
    dplyr::mutate(UniProtData = filteredXMLData[UNIPROT_KB_ID])
  return(clueTable)
}


getCompoundList <- function(clueTable) {
  compoundList <- clueTable %>%
    select(pert_iname) %>%
    distinct() %>%
    filter(!is.na(pert_iname)) %>%
    pull(1)
  return(compoundList)
}

## Additional counters for KEGG pathways and STRING neighbours -----------

#' keggPathWayCounter
#'
#' Add count of KEGG pathways per target.
#'
#' @param clueTable Input dataframe with UNIPROT_KB_ID column.
#'
#' @return updated clueTable
keggPathWayCounter <- function(clueTable) {

  ## function to get data in a customized way: get plain text file via HTTP
  textGET <- function(url) {
    # if it is a path of a cache file
    if (stringr::str_starts(url, CACHE)) {
      result <- readr::read_file(url)
    } else {
      result <- httr::GET(url)
      result <- httr::content(result, as = 'text')
    }
    return(result)
  }

  ## build a local cache of downloaded data ----

  uniProtList <- clueTable %>%
    select(UniProtData) %>% distinct() %>% pull(1)
  keggList <- sapply(uniProtList, simplify = FALSE, function(uniProtData) {
    retVal <- list(document = NA)
    if (hasName(uniProtData, 'KEGG')) {
      keggId <- uniProtData$KEGG
      url <- glue::glue('http://rest.kegg.jp/get/{keggId}') # TODO: expose it
      retVal <- getPageCached(url, downloadFunc = textGET)
    }
    return(retVal)
  })

  ## count pathways in KEGG result list ----
  keggRegex <- stringr::regex(
    "^PATHWAY.+?^(NETWORK|DISEASE|DRUG_TARGET|BRITE)",
    multiline = TRUE, dotall = TRUE
  )
  keggList <- sapply(keggList, simplify = FALSE, function(keggResult) {
    keggText <- keggResult[["document"]]
    ## extract text block, then count number of lines
    numOfPathwayEntries <- stringr::str_extract(keggText, keggRegex) %>%
      stringr::str_count("\n")
    numOfPathwayEntries <- ifelse(
      is.na(numOfPathwayEntries), 0, numOfPathwayEntries
    )
    return(numOfPathwayEntries)
  })

  ## update the input table and return the result -------------------------
  clueTable <- clueTable %>%
    dplyr::rowwise() %>%
    dplyr::mutate(NumberOfKEGGPathways = keggList[[UNIPROT_KB_ID]])

  return(clueTable)
}


stringInteractorsCounter <- function(clueTable) {

  ## function to get data in a customized way: get plain text file via HTTP
  tsvGET <- function(url) {
    # if it is a path of a cache file
    if (stringr::str_starts(url, pattern = CACHE)) {
      result <- readr::read_tsv(url)
    } else {
      result <- httr::GET(url)
      result <- httr::content(result)
    }
    return(result)
  }

  ## build a local cache of downloaded data ----
  uniProtList <- clueTable %>% select(UniProtData) %>% distinct() %>% pull(1)
  interactorsCache <- sapply(uniProtList, simplify = FALSE, function(uniProtData) {
    retVal <- list(document = NA)
    if (hasName(uniProtData, 'STRING')) {
      STRING_ID <- uniProtData$STRING
      url <- glue::glue('https://string-db.org/api/tsv/interaction_partners?identifiers={STRING_ID}&species=9606&limit=0&required_score=900')
      retVal <- getPageCached(url, downloadFunc = tsvGET)
    }
    return(retVal)
  })
  interactorsCache <- sapply(interactorsCache, simplify = FALSE, function(stringResult) {
    ## get the tibble
    stringTable <- stringResult[["document"]]
    interactorsCount <- ifelse(tibble::is_tibble(stringTable), nrow(stringTable), 0)
    return(interactorsCount)
  })

  ## update the input table and return the result -------------------------
  clueTable <- clueTable %>%
    dplyr::rowwise() %>%
    dplyr::mutate(NumberOfSTRINGInteractors = interactorsCache[[UNIPROT_KB_ID]])

  return(clueTable)
}


##
## Web/Networking ---------------------------------------------------------
##

#' Load and cache files
#'
#' @param url URL pointing to XML/HTML file.
#' @param sleepTime sleep interval in seconds before download,
#' if content is not cached. Default value is 3.
#'
#' @return XML representation of the file content.
#' @import dplyr
getPageCached <- function(url, sleepTime = 3, downloadFunc = rvest::read_html) {
  cacheFile <- glue::glue(CACHE, "/cache.tsv")
  ## initialize tibble object
  cache <- if (file.exists(cacheFile)) {
    readr::read_tsv(cacheFile, col_types = "ccT")
  } else {
    dir.create(CACHE, recursive = TRUE)
    tibble::new_tibble(
      list(URL = "INIT", cache_object = "_", time = Sys.time()),
      nrow = 1
    )
  }

  # load from cache object
  {
    fileName <- cache %>%
      filter(URL == url) %>%
      select(cache_object)

    fileName <- if (nrow(fileName) == 1) {
      fileName %>% pull(cache_object)
    } else {
      character(0)
    }
    notFound <- length(fileName) == 0
    if(notFound) {
      now <- Sys.time()
      fileName <- glue::glue(
        "{getRandomNumber()}-{as.integer(now)}"
      )
    }

    path <- glue::glue("{CACHE}/{fileName}")
    if (!notFound && file.exists(path)) {
      print(glue::glue("{Sys.time()} :: from cache: {url}"))
      # returning from cache
      return(list(
        document = downloadFunc(path),
        fromCache = TRUE
      ))
    } else if (!notFound && !file.exists(path)) {
      stop(glue::glue("file not found: {path} !!!!!! url: {url}"))
    } else {
      # if file doesn't exists
      # ...sleep
      if (sleepTime > 0) {
        print(glue::glue("# sleeping {sleepTime} secs"))
        Sys.sleep(sleepTime)
      }
      # ...download
      resultPage <- downloadFunc(url)
      print(glue::glue("{Sys.time()} :: downloaded: {url}"))
      # ...and save
      if(tibble::is_tibble(resultPage)) {
        readr::write_tsv(resultPage, file = path)
      } else {
        readr::write_file(toString(resultPage), file = path)
      }
      # ...update cache
      {
        newRow <- tibble::tibble_row(
          URL = url,
          cache_object = fileName,
          time = now
        )
        cache <- cache %>%
          tibble::add_row(newRow)
      }
      # ...save updated cache
      readr::write_tsv(x = cache, file = cacheFile)
      # returning recently downloaded page
      return(list(
        document = resultPage,
        fromCache = FALSE
      ))
    }
  }
}


#' Random number generator
#'
#' Generates random integers between 1 and 10000.
#'
#' @return Random integer number.
getRandomNumber <- function() {
  numbers <- runif(10, min = 1, max = 10000)
  return(
    floor(sample(numbers, 1))
  )
}

#' Create an "anchor" element
#'
#' @param link Parameter of "href" attribute.
#' @param titleText character content of "a" element.
#'
#' @return HTML "a" snippet that can be used in HTML document directly.
aHref <- function(link, titleText) {
  return(glue::glue('<a href="{link}" target="_blank">{titleText}</a>'))
}
cycle20/scancer documentation built on Jan. 10, 2023, 11:21 a.m.