Nothing
#' Get Wikidata Item ID
#'
#' Search www.wikidata.org for wikidata item identifiers. Note that this search
#' is currently not limited to chemical substances, so be sure to check your
#' results.
#'
#' @param query character; The searchterm
#' @param language character; the language to search in
#' @param match character; How should multiple hits be handeled? 'all' returns
#' all matched IDs, 'first' only the first match, 'best' the best matching (by
#' name) ID, 'ask' is a interactive mode and the user is asked for input, na'
#' returns NA if multiple hits are found.
#' @param verbose logical; print message during processing to console?
#'
#' @return if match = 'all' a list with ids, otherwise a dataframe with 4 columns:
#' id, matched text, string distance to match and the queried string
#'
#' @note Only matches in labels are returned.
#'
#' @import jsonlite httr
#' @importFrom utils URLencode URLdecode
#' @importFrom purrr map_df
#' @export
#' @examples
#' \dontrun{
#' get_wdid('Triclosan', language = 'de')
#' get_wdid('DDT')
#' get_wdid('DDT', match = 'all')
#'
#' # multiple inputs
#' comps <- c('Triclosan', 'Glyphosate')
#' get_wdid(comps)
#' }
get_wdid <-
function(query,
match = c('best', 'first', 'all', 'ask', 'na'),
verbose = getOption("verbose"),
language = 'en') {
if (!ping_service("wd")) stop(webchem_message("service_down"))
# language <- 'en'
# query <- 'Triclosan'
match <- match.arg(match)
foo <- function(query, language, match, verbose){
if (is.na(query)){
if (verbose) webchem_message("na")
id <- NA
matched_sub <- NA
} else {
query1 <- URLencode(query, reserved = TRUE)
limit <- 50
qurl <-
paste0("wikidata.org/w/api.php?action=wbsearchentities&format=json&type=item")
qurl <- paste0(qurl, "&language=", language, "&limit=", limit, "&search=", query1)
if (verbose) webchem_message("query", query, appendLF = FALSE)
webchem_sleep(type = 'API')
res <- try(httr::RETRY("GET",
qurl,
httr::user_agent(webchem_url()),
terminate_on = 404,
quiet = TRUE), silent = TRUE)
if (inherits(res, "try-error")) {
if (verbose) webchem_message("service_down")
return(tibble::tibble(query = query, match = NA, wdid = NA))
}
if (verbose) message(httr::message_for_status(res))
if (res$status_code == 200) {
cont <- jsonlite::fromJSON(httr::content(res,
type = "text",
encoding = "utf-8"))
search <- cont$search
if (length(search) == 0) {
if (verbose) webchem_message("not_found")
id <- NA
matched_sub <- NA
} else {
# use only matches on label
search <- search[search$match$type %in% c('label', 'alias'), ]
# # check matches
search <- search[tolower(iconv(search$match$text,
"latin1",
"ASCII",
sub = "")) == tolower(query), ]
id <-
matcher(
search$id,
query = query,
result = search$label,
match = match,
# from = from,
verbose = verbose
)
matched_sub <- names(id)
}
}
else {
id <- NA
matched_sub <- NA
}
}
out <- tibble(query = query, match = matched_sub, wdid = id)
return(out)
}
out <-
purrr::map_df(query,
~ foo(
query = .x,
match = match,
language = language,
verbose = verbose
))
return(out)
}
#! Use SPARQL to search of chemical compounds (P31)?! For a finer / better search?
#' Retrieve identifiers from Wikidata
#'
#' @import jsonlite
#' @import httr
#'
#' @param id character; identifier, as returned by \code{\link{get_wdid}}
#' @param verbose logical; print message during processing to console?
#'
#' @return A data.frame of identifiers. Currently these are 'smiles', 'cas', 'cid', 'einecs', 'csid', 'inchi', 'inchikey',
#' 'drugbank', 'zvg', 'chebi', 'chembl', 'unii', 'lipidmaps', 'swisslipids' and source_url.
#'
#' @note Only matches in labels are returned. If more than one unique hit is found,
#' only the first is returned.
#'
#' @seealso \code{\link{get_wdid}}
#'
#' @references Willighagen, E., 2015. Getting CAS registry numbers out of WikiData. The Winnower.
#' \doi{10.15200/winn.142867.72538}
#'
#' Mitraka, Elvira, Andra Waagmeester, Sebastian Burgstaller-Muehlbacher, et al. 2015
#' Wikidata: A Platform for Data Integration and Dissemination for the Life Sciences and beyond. bioRxiv: 031971.
#'
#' @export
#' @examples
#' \dontrun{
#' id <- c("Q408646", "Q18216")
#' wd_ident(id)
#' }
wd_ident <- function(id, verbose = getOption("verbose")){
if (!ping_service("wd")) stop(webchem_message("service_down"))
# id <- c( "Q163648", "Q18216")
# id <- 'Q408646'
foo <- function(id, verbose){
empty <- as.list(rep(NA, 15))
names(empty) <- c("smiles", "cas", "cid", "einecs", "csid", "inchi",
"inchikey", "drugbank", "zvg", "chebi", "chembl", "unii",
'lipidmaps', 'swisslipids', "source_url")
if (is.na(id)) {
if (verbose) webchem_message("na")
return(empty)
}
baseurl <- 'https://query.wikidata.org/sparql?format=json&query='
props <- c('P233', 'P231', 'P662', 'P232', 'P661', 'P234', 'P235', 'P715', 'P679',
'P683', 'P592', 'P652', 'P2063', 'P8691')
names <- c('smiles', 'cas', 'cid', 'einecs', 'csid', 'inchi', 'inchikey',
'drugbank', 'zvg', 'chebi', 'chembl', 'unii', 'lipidmaps', 'swisslipids')
sparql_head <- paste('PREFIX wd: <http://www.wikidata.org/entity/>',
'PREFIX wdt: <http://www.wikidata.org/prop/direct/>',
'SELECT * WHERE {')
sparql_body <- paste(paste0('OPTIONAL{wd:', id, ' wdt:', props, ' ?', names, ' .}'),
collapse = ' ')
sparql <- paste(sparql_head, sparql_body, '}')
qurl <- paste0(baseurl, sparql)
qurl <- URLencode(qurl)
webchem_sleep(type = 'API')
if (verbose) webchem_message("query", id, appendLF = FALSE)
res <- try(httr::RETRY("GET",
qurl,
httr::user_agent(webchem_url()),
terminate_on = 404,
quiet = TRUE), silent = TRUE)
if (inherits(res, "try-error")) {
if (verbose) webchem_message("service_down")
return(empty)
}
if (verbose) message(httr::message_for_status(res))
if (res$status_code == 200) {
tmp <- fromJSON(content(res, as = "text"))
vars_out <- tmp$head$vars
out <- tmp$results$bindings
if (length(out) == 0) {
if (verbose) webchem_message("not_found")
out <- as.list(rep(NA, 15))
names(out) <- c(vars_out, 'source_url')
return(out)
}
if (nrow(out) > 1) {
message("More then one unique entry found! Returning first.")
out <- out[1, ]
}
out <- lapply(out, '[[', 'value')
# check for missing entries and add to out-list
miss <- names[!names %in% names(out)]
for (i in miss) {
out[[i]] <- NA
}
out <- out[names]
out[['source_url']] <- paste0('https://www.wikidata.org/wiki/', id)
out <- unlist(out)
return(out)
}
else {
return(empty)
}
}
# ugly fixing to return data.frame
out <- data.frame(t(sapply(id, foo,verbose = verbose)), stringsAsFactors = FALSE, row.names = seq_along(id))
out[['query']] <- id
# even more ugly...
out <- data.frame(t(apply(out, 1, unlist)), stringsAsFactors = FALSE)
class(out) <- c('wd_ident', 'data.frame')
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.