#### wiki_utils.R
#### Angel Zazo <angelzazo@usal.es>
#### ver. 0.0 2021-11-09
#### ver. 0.1 2023-05-12
#### ver. 0.2 2023-07-27
#### ver. 0.3 2023-09-19
#### ver. 1.2.5 2024-04-18

# General user_agent header for Wikimedia, Wikidata, MediaWiki and VIAF requests ----
#' See https://meta.wikimedia.org/wiki/User-Agent_policy
#'     https://www.mediawiki.org/wiki/API:Etiquette
user_agent <- paste('wikiTools Package, ', R.version.string)

# In the MediaWiki API, specifying pages through the titles or pageids parameters
# is limited to 50 per query, or 500 for clients with the "apihighlimits" right.
# See https://www.mediawiki.org/wiki/API:Query#Additional_notes.
MW_LIMIT <- 50

# The VIAF API returns a maximum of 250 records per request.
# See https://developer.api.oclc.org/viaf-api#/Authority%20Cluster
VIAF_LIMIT <- 250

#' Limits the rate at which a function will execute
#'
#' @param f The original function
#' @param n Number of allowed events within a period
#' @param period Length (in seconds) of measurement period
#' @return If 'f' is a single function, then a new function with the same
#' signature and (eventual) behavior as the original function, but rate limited.
#' If 'f' is a named list of functions, then a new list of functions with the
#' same names and signatures, but collectively bound by a shared rate limit.
#' In this package it is used only for the Wikidata Query Service (WDQS).
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @seealso ratelimitr
#' @importFrom ratelimitr limit_rate rate
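#' @examples
#' \dontrun{
#' # A minimal sketch: wrap reqWDQS so that at most 30 requests are sent per
#' # 60-second period (the figures are illustrative, not an API quota).
#' reqWDQS_rated <- limitRequester(reqWDQS, n = 30, period = 60)
#' r <- reqWDQS_rated('SELECT ?human WHERE {?human wdt:P31 wd:Q5} LIMIT 3')
#' }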
limitRequester <- function(f, n, period) {
  return(ratelimitr::limit_rate(f, ratelimitr::rate(n = n, period = period)))
}

#' Execute a function in chunks.
#'
#' Execute the function f(x, ...) in chunks of `chunksize` elements each.
#' The Wikidata and Wikimedia APIs impose limits on queries: Wikidata has
#' timeout limits and Wikimedia limits the number of titles or pageids per
#' request. This function executes `f` sequentially over chunks of elements to
#' avoid those errors.
#' @param f The function to execute.
#' @param x Vector of entities or titles/pageids.
#' @param chunksize The maximum number of elements of `x` passed to `f` in each call.
#' @param ... Additional arguments passed to `f`.
#' @return The combined results of executing `f` over all values of `x`.
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
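#' @examples
#' \dontrun{
#' # A minimal sketch: w_LabelDesc accepts the `nlimit` and `debug` arguments
#' # that doChunks forwards, so it can be driven through doChunks directly
#' # (normally doChunks is only called internally by the w_* functions).
#' l <- c("Q57860", "Q712609", "Q381800", "Q1252859")
#' d <- doChunks(w_LabelDesc, l, chunksize = 2, what = 'L', langsorder = 'en',
#'               debug = 'info')
#' }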
doChunks <- function(f, x, chunksize, ...){
  l <- list(...)  # to obtain `debug` parameter
  n <- length(x)
  nlim <- as.integer(n/chunksize)
  output <- NULL
  for (k in 0:nlim){
    offset <- k*chunksize
    if ((offset+1) > n)
      break
    x_vect <- x[(offset+1):min(n,offset+chunksize)]
    if (!is.null(l$debug) && l$debug != FALSE)
      cat(paste0("  INFO: Executing the function on elements from ", offset+1,
                 " to ", offset+length(x_vect)),"\n", file=stderr())
    d <- f(x_vect, nlimit=chunksize, ...)
    if (k==0)
      output <- d
    else
      output <- rbind(output, d)
  }
  return(output)
}

#' Check if all Wikidata entities in entity_list have valid values
#'
#' Return the vector of entities with duplicates and empty strings removed. A
#' valid entity is a Wikibase item (Qxxx, where x is a digit) or a Wikibase
#' property (Pxxx).
#' @param entity_list A vector with the Wikidata entities.
#' @return The vector of unique entities, or an error is raised if any entity
#' is invalid.
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
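#' @examples
#' \dontrun{
#' # A minimal sketch: leading/trailing spaces, empty strings and duplicates
#' # are removed; malformed entities raise an error.
#' checkEntities(c("Q57860", " Q57860 ", "", "P569"))  # -> "Q57860" "P569"
#' # checkEntities(c("Q57860", "xyz"))                 # raises an error
#' }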
checkEntities <- function(entity_list) {
  # Trim and remove ''
  entity_list <- trimws(entity_list)
  entity_list <- entity_list[!entity_list=='']
  p <- grepl("^(?:Q|P)\\d+$", entity_list)
  if (!all(p)) {
    stop(paste0("ERROR: Detected incorrect entities: ", paste0(entity_list[!p], collapse = ', ')))
  }
  return(unique(entity_list))
}

# WikiData Query Service ----
# See https://query.wikidata.org/
# See https://www.wikidata.org/wiki/Wikidata:SPARQL_tutorial
# See https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual
# --------------------------------------------------------------------.

#' Get responses from Wikidata Query Service
#'
#' Retrieve responses from Wikidata Query Service (WDQS)
#' @param sparql_query A string with the query in SPARQL language (SELECT query).
#' @param format A string with the query response format, mandatory. See
#' https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#SPARQL_endpoint.
#' Only  'json', 'xml' or 'csv' formats are allowed, default 'json'.
#' @param method The method used in the httr request, GET or POST, mandatory.
#' Default 'GET'. Use 'POST' method for long SELECT clauses.
#' @return The response in the format selected. Please check httr::stop_for_status(response)
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @importFrom httr GET POST user_agent add_headers
#' @note For short queries the GET method is better, POST for long ones. Only GET queries are cached.
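#' @examples
#' \dontrun{
#' # A minimal sketch: raw httr response containing SPARQL results in JSON.
#' r <- reqWDQS('SELECT ?human WHERE {?human wdt:P31 wd:Q5} LIMIT 3',
#'              format = 'json', method = 'GET')
#' httr::stop_for_status(r)
#' str(httr::content(r, as = "text", encoding = "UTF-8"))
#' }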
reqWDQS <- function(sparql_query, format='json', method='GET') {
  if (format=='json')      wdqs_format <- "application/sparql-results+json"
  else if (format=='xml')  wdqs_format <- "application/sparql-results+xml"
  else if (format=='csv')  wdqs_format <- "text/csv"
  else stop(paste0("ERROR: format '", format, "' is not supported."))

  url <- 'https://query.wikidata.org/sparql'
  if(!curl::has_internet() || httr::http_error(url)){
    message("No internet connection or data source broken.")
    return(NULL)
  }

  if (method=='GET')
    httr::GET(
      url = url,
      query = list(query = sparql_query),
      httr::user_agent(user_agent),
      httr::add_headers(Accept = wdqs_format)
    )
  else if (method=='POST')
    httr::POST(
      url = url,
      body = list(query = sparql_query),
      httr::add_headers(Accept = wdqs_format, user_agent = user_agent),
      encode = 'form'
    )
  else stop(paste0("ERROR: method '", method, "' is not supported."))
}

#' Response from Wikidata Query Service
#'
#' Retrieve responses from Wikidata Query Service (WDQS). Uses ratelimitr if
#' param limitRequester = TRUE.
#' @param sparql_query A string with the query in SPARQL language.
#' @param format A string with the query response format. Mandatory.
#' See https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#SPARQL_endpoint.
#' Only  'json', 'xml' or 'csv' formats are allowed, default 'csv'.
#' @param method The method used in the httr request, GET or POST, mandatory.
#' Default 'GET'.
#' @param limitRequester If TRUE, uses ratelimitr to limit the requests.
#' @return The response in selected format or NULL on errors.
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @importFrom httr stop_for_status content
#' @importFrom jsonlite fromJSON
#' @importFrom utils read.csv
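#' @examples
#' \dontrun{
#' # A minimal sketch: a small SELECT returned as a data-frame (format='csv').
#' q <- 'SELECT ?human ?humanLabel WHERE {
#'         ?human wdt:P31 wd:Q5.
#'         SERVICE wikibase:label {bd:serviceParam wikibase:language "en".}
#'       } LIMIT 5'
#' df <- w_query(q, format = 'csv', method = 'GET')
#' }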
w_query <- function(sparql_query, format="csv", method="GET",
                    limitRequester=FALSE) {
  # reqWDQS_rated: The ratelimitr version of reqWDQS
  # https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#SPARQL_endpoint
  if (limitRequester)
    reqWDQS_rated <- limitRequester(reqWDQS, n=60, period=60)
  else
    reqWDQS_rated <- reqWDQS
  #
  tryCatch(
    {
      r <- reqWDQS_rated(sparql_query, format=format, method=method)
      if(is.null(r)){
        return(NULL)
      }
      httr::stop_for_status(r)
      content <- httr::content(r, as = "text", encoding = "UTF-8")
      rtype <- httr::http_type(r)
      if (format == 'json' && startsWith(rtype, "application/sparql-results+json"))
        output <- jsonlite::fromJSON(content, simplifyVector = FALSE)
      else if (format == 'xml' && startsWith(rtype, "application/sparql-results+xml"))
        output <- content
      else if (format == 'csv' && startsWith(rtype, "text/csv"))
        output <- read.csv(text = content)
      else
        stop(paste0("ERROR: format '", format, "' or response type '",rtype,"' is incorrect."))
      return(output)
    }, error = function(e){
      cat(as.character(e), file=stderr())
      return(NULL)
    }
  )
}

#' Check if a Wikidata entity is an instance of a class
#'
#' Check using WDQS whether the Wikidata entities in entity_list are instances
#' of the `instanceof` Wikidata class. For example, if instanceof="Q5", check
#' whether the entities are instances of the Wikidata class Q5, i.e., are
#' humans. Several entity classes separated by '|' are allowed; in this case
#' the OR operator is used. If instanceof='' no filter is applied: the function
#' returns all Wikidata classes of which each entity in the list is an instance.
#' Duplicated entities are removed before the search.
#' Note that no labels or descriptions of the entities are returned; please use
#' the function `w_LabelDesc` for that.
#'
#' @param entity_list A vector with the Wikidata entities.
#' @param instanceof The Wikidata class to check, mandatory. Several entity
#' classes separated by '|' are allowed; in this case the OR operator is used.
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please,
#' reduce the default value if error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return A data-frame with three columns: the Wikidata entity; all the
#' Wikidata classes of which the entity is an instance; and, if the
#' `instanceof` parameter is set, TRUE or FALSE depending on whether the entity
#' is an instance of that class.
#' @examples
#' \dontrun{
#' # aux: get a vector of entities (l).
#' df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')
#' l <- df$entity
#'
#' df <- w_isInstanceOf(entity_list=l, instanceof='Q5')
#' # Not TRUE
#' df[!df$instanceof_Q5,]
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_isInstanceOf <- function(entity_list, instanceof='', nlimit=50000, debug=FALSE) {
  # check entities
  entity_list <- checkEntities(entity_list)
  #
  n <- length(entity_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_isInstanceOf, entity_list, nlimit, instanceof=instanceof,
                     debug=debug))
  }
  #
  values <- paste0("wd:", paste0(entity_list, collapse = ' wd:'))
  #
  query <- paste0('SELECT ?entity
(GROUP_CONCAT(DISTINCT ?instanc;separator="|") as ?instanceof)
WHERE {
  OPTIONAL {
    VALUES ?entity {', values,'}
    OPTIONAL {?entity wdt:P31 ?instanc.}
  }
} GROUP BY ?entity\n')
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format='csv', method='POST')
  if(is.null(r)){
    return(NULL)
  }
  r$entity <- sub('^http://www.wikidata.org/entity/', '', r$entity)
  r$instanceof <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
  #
  # Add new column for instanceof parameter:
  if (instanceof!='')
    r[[paste0('instanceof_', instanceof)]] = grepl(paste0('\\b(?:', instanceof, ")\\b"), r$instanceof)
  #
  rownames(r) <- r$entity
  return(r)
}

#' Get Wikipedia pages of Wikidata entities
#'
#' Get from Wikidata all Wikipedia page titles and URLs of the Wikidata entities
#' in entity_list. If the parameter `wikilangs`='', all Wikipedia page titles
#' are returned, otherwise only those in the languages of `wikilangs`. The
#' returned data-frame also includes the Wikidata classes of which the searched
#' entity is an instance. If the parameter `instanceof` is set, only the pages
#' of Wikidata entities which are instances of that Wikidata class are
#' returned. The data-frame does not contain labels or descriptions of the
#' entities: the function `w_LabelDesc` can be used for that. Duplicated
#' entities are removed before the search. The index of the returned data-frame
#' is also set to entity_list.
#' @param entity_list A vector of Wikidata entities.
#' @param wikilangs List of languages to limit the search, using "|" as
#' separator. Wikipedia page titles are returned in the same order as the
#' languages in this parameter. If wikilangs='' the function returns Wikipedia page titles
#' in any language, not sorted.
#' @param instanceof Wikidata entity class to limit the result to the instances
#' of that class. For example, if instanceof='Q5', limit the results to "human".
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please,
#' reduce the default value if error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return A data-frame with six columns: entity, instanceof, count (number of
#' pages), langs, names (page titles) and pages (page URLs). The multi-valued
#' columns use "|" as separator. The index of the data-frame is also set to
#' entity_list.
#' @examples
#' \dontrun{
#' # aux: get a vector of entities (l).
#' df <- w_SearchByLabel(string='Napoleon', langsorder='en', mode='inlabel')
#' l <- df$entity  # aprox. 3600
#'
#' w <- w_Wikipedias(entity_list=l, debug='info')
#' w <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr', debug='info')
#' # Filter instanceof=Q5 (human):
#' w_Q5 <- w[grepl("\\bQ5\\b", w$instanceof), ]
#' w_Q5b <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr', instanceof='Q5', debug='info')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_Wikipedias <- function(entity_list, wikilangs="", instanceof='', nlimit=1500, debug=FALSE) {
  # check entities
  entity_list <- checkEntities(entity_list)
  #
  n <- length(entity_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_Wikipedias, entity_list, nlimit, wikilangs=wikilangs,
                     instanceof=instanceof, debug=debug))
  }
  #
  values <- paste0("wd:", paste0(entity_list, collapse = ' wd:'))
  #
  wikilangs <- trimws(wikilangs)
  if (wikilangs=="")
    w_filter <- ''
  else {
    wikiorder <- strsplit(wikilangs, '|', fixed = T)[[1]]
    w_filter  <- paste0("FILTER(?lang IN ('",gsub('|', "','", wikilangs, fixed=T), "'))")
  }
  #
  query <- paste0("SELECT DISTINCT ?entity
(GROUP_CONCAT(DISTINCT ?instanc;separator=\"|\") as ?instanceof)
(COUNT(DISTINCT ?page) as ?count)
(GROUP_CONCAT(DISTINCT ?lang;separator=\"|\") as ?langs)
(GROUP_CONCAT(DISTINCT ?name;separator=\"|\") as ?names)
(GROUP_CONCAT(DISTINCT ?page;separator=\"|\") as ?pages)
WHERE {
  OPTIONAL {
    VALUES ?entity {", values, "}
    OPTIONAL {?entity wdt:P31 ?instanc.}
    OPTIONAL {
      ?page schema:about ?entity;
            schema:inLanguage ?lang;
            schema:name ?name;
            schema:isPartOf [wikibase:wikiGroup \"wikipedia\"].\n",
                  w_filter,"
    }
  }
} GROUP BY ?entity\n")
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format='csv', method='POST')
  if(is.null(r)){
    return(NULL)
  }
  r$entity <- sub('^http://www.wikidata.org/entity/', '', r$entity)
  r$instanceof <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
  # Filter instanceof
  if (instanceof!='')
    r <- r[grepl(paste0('\\b(?:', instanceof, ")\\b"), r$instanceof), ]
  # If all entities are excluded:
  if (nrow(r) == 0)
    return(r)
  #
  r$count <- as.integer(r$count)
  rownames(r) <- r$entity
  # Order by wikilangs
  if (wikilangs!='') {
    for (entity in r$entity) {
      if (r[entity, 'count'] > 1) {
        l <- strsplit(r[entity, 'langs'], '|', fixed = T)[[1]]
        o <- match(wikiorder, l)
        o <- o[!is.na(o)]
        r[entity, "langs"] <- paste0(l[o], collapse = '|')
        l <- strsplit(r[entity, 'names'], '|', fixed = T)[[1]]
        r[entity, "names"] <- paste0(l[o], collapse = '|')
        l <- strsplit(r[entity, 'pages'], '|', fixed = T)[[1]]
        r[entity, "pages"] <- paste0(l[o], collapse = '|')
      }
    }
  }
  # Create a NA data-frame and fill it with retrieved values
  q <- data.frame(entity=r$entity, instanceof=NA, count=NA, langs=NA, names=NA, pages=NA)
  rownames(q) <- q$entity
  q[r$entity, ] <- r
  return(q)
}

#' Check if Wikidata entities are valid
#'
#' An entity is valid if it has a label or a description. If an entity exists
#' but is not valid, it may redirect to another entity; in that case the
#' redirection is obtained. Other entities may have existed in the past but
#' have been deleted. The returned data-frame also includes the Wikidata class
#' (another Wikidata entity) of which the searched entities are instances. The
#' data-frame does not contain labels or descriptions of the entities: the
#' function `w_LabelDesc` can be used for valid entities. Duplicated entities
#' are removed before the search. The index of the returned data-frame is also
#' set to entity_list.
#'
#' @param entity_list A vector with the Wikidata entities.
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please,
#' reduce the default value if error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return A data-frame with four columns: entity, valid (TRUE or FALSE),
#' instanceof and redirection (if the entity redirects to another Wikidata
#' entity, the redirection column contains the target entity).
#' @examples
#' \dontrun{
#' w_isValid(c("Q9021", "Q115637688", "Q105660123"))
#' # Large list
#' l  <- w_SearchByOccupation(Qoc='Q2306091')
#' l2 <- append(l$entity, c("Q115637688", "Q105660123"))  # Note: adding two new entities
#' v <- w_isValid(l2)
#' # Not valid
#' v[!v$valid, ]
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_isValid <- function(entity_list, nlimit=50000, debug=FALSE) {
  # check entities
  entity_list <- checkEntities(entity_list)
  #
  n <- length(entity_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_isValid, entity_list, nlimit, debug=debug))
  }
  #
  values <- paste0("wd:", paste0(entity_list, collapse = ' wd:'))
  #
  query <- paste0("SELECT ?entity ?valid
(GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof) ?redirection
WHERE {
  OPTIONAL {
   VALUES ?entity {", values, '}
   BIND(EXISTS{?entity rdfs:label []} || EXISTS{?entity schema:description []} AS ?valid).
   OPTIONAL {?entity wdt:P31 ?instanc.}
   OPTIONAL {?entity owl:sameAs ?redirection}
  }
} GROUP BY ?entity ?valid ?redirection\n')
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format = "csv", method = 'POST')
  if(is.null(r)){
    return(NULL)
  }
  #
  for (q in c('entity', 'instanceof', 'redirection'))
    r[[q]] <- gsub('http://www.wikidata.org/entity/', '', r[[q]])
  r$valid <- ifelse(r$valid == 'true', T, F)
  # Convert NA -> ""
  r$redirection <- ifelse(is.na(r$redirection),"",r$redirection)
  rownames(r) <- r$entity
  return(r)
}

#' Get properties of Wikidata entities
#'
#' Search the entities of `entity_list` for one or more properties. If the
#' searched properties can have values in more than one language, the parameter
#' `langsorder` sets the order of the languages used. If the parameter
#' `includeQ` is TRUE, the Wikidata entities of the property values are also
#' returned. The Wikidata classes of which the entities are instances are
#' returned too. Duplicated entities are removed before the search. The index
#' of the data-frame is also set to entity_list.
#'
#' @param entity_list A vector with the Wikidata entities.
#' @param Pproperty Wikidata properties to search, separated with '|', mandatory.
#' For example, if Pproperty="P21", the results contain information about the
#' sex of the entities. If Pproperty="P21|P569" the birth date is also searched.
#' If Pproperty='P21|P569|P214' the VIAF identifier is also searched.
#' @param includeQ If the value is TRUE the function returns the Wikidata entity
#' (Qxxx) of the property values. If `langsorder` also has language(s), the
#' labels, if any, are returned too. Note that includeQ is only effective if the
#' values of the property are Wikidata entities, otherwise the same values as
#' the label are returned.
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, the next is used. This parameter is mandatory if the parameter
#' `includeQ` is FALSE. If includeQ=TRUE and langsorder='' no labels are returned.
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please,
#' reduce the default value if error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return A data-frame with the entity, the entities of the properties and the
#' labels in langsorder for them.
#' @examples
#' \dontrun{
#' w_Property(c("Q1252859", "Q712609", "Q381800"), Pproperty='P21|P569|P214', langsorder='en|es')
#' # Large list
#' df <- w_SearchByOccupation(Qoc='Q2306091') # ~ 20000
#' l <- df$entity
#' p <- w_Property(l, Pproperty='P21|P569|P214', langsorder='es|en', debug='info')
#' # Get birth-place (P19)
#' p <- w_Property(l, Pproperty='P19', langsorder='es|en', includeQ=TRUE, debug='info')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_Property <- function(entity_list, Pproperty, includeQ=FALSE, langsorder='en',
                       nlimit=10000, debug=FALSE) {
  #
  Pproperty <- trimws(Pproperty)
  langsorder <- trimws(langsorder)
  if (Pproperty == "" || (langsorder == '' && !includeQ))
    stop(paste0("ERROR: one or more parameters 'Pproperty', 'includeQ' or 'langsorder' are incorrect."))
  # check entities
  entity_list <- checkEntities(entity_list)
  #
  n <- length(entity_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_Property, entity_list, nlimit, Pproperty=Pproperty,
                     includeQ=includeQ, langsorder=langsorder, debug=debug))
  }
  #
  if (langsorder!='') {
    langsorder <- gsub("|", ",", langsorder, fixed = T)
    searchlang <- paste0('\n   SERVICE wikibase:label {bd:serviceParam wikibase:language "', langsorder, '".')
  }
  #
  group_concat <- character()
  search <- character()
  #
  props <- strsplit(Pproperty, '|', fixed = T)[[1]]
  for (p in props) {
    if (includeQ)
      group_concat <- append(group_concat, paste0("(GROUP_CONCAT(DISTINCT ?", p ,"p;separator='|') as ?",p,")"))
    if (langsorder!='') {
      group_concat <- append(group_concat, paste0("(GROUP_CONCAT(DISTINCT STR(?", p ,"label);separator='|') as ?",p,"Label)"))
      searchlang <- append(searchlang, paste0('      ?', p,'p rdfs:label ?', p, 'label.'))
    }
    search <- append(search, paste0('    OPTIONAL {?entity wdt:', p, ' ?', p, 'p.}'))
  }
  group_concat <- paste0(group_concat, collapse = "\n")
  search       <- paste0(search, collapse = "\n")
  if (langsorder!='')
    searchlang <- paste0(paste0(searchlang, collapse = "\n"), '}')
  else
    searchlang <- ''
  #
  values <- paste0("wd:", paste0(entity_list, collapse = ' wd:'))
  query <- paste0("SELECT DISTINCT ?entity (GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof)\n",
                  group_concat,"
WHERE {
  OPTIONAL {
    VALUES ?entity {",values,"}
    OPTIONAL {?entity wdt:P31 ?instanc.}\n",
                  search,
                  searchlang, '
  }
} GROUP BY ?entity\n')
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format = "csv", method = 'POST')
  if(is.null(r)){
    return(NULL)
  }
  if (includeQ) {
    for (p in props)
      r[[p]] <- gsub('http://www.wikidata.org/entity/', '', r[[p]])
  }
  r$entity <- gsub('http://www.wikidata.org/entity/', '', r$entity)
  r$instanceof <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
  rownames(r) <- r$entity
  return(r)
}

#' Get Latitude and Longitude coordinates, and Country of places
#'
#' Get Latitude and Longitude coordinates of the Wikidata entities which are
#' places. Also the countries they belong to are returned.
#'
#' @param entity_list A vector with the Wikidata entities (places).
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, next is used. If langsorder=='', then labels or descriptions
#' are not returned.
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please,
#' reduce the default value if error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return A data-frame with 'entity', label, Latitude and Longitude, country
#' and label of the country.
#' @examples
#' \dontrun{
#' w_Geoloc(c("Q57860", "Q90", "Q15695"), langsorder="")
#' w_Geoloc(c("Q57860", "Q90", "Q15695"), langsorder="se") # Note label of place for Q15695
#' w_Geoloc(c("Q57860", "Q90", "Q15695"), langsorder="se|fr")
#' df <- w_SearchByOccupation(Qoc='Q2306091') # aprox. 20000
#' l <- df$entity
#' # Get birth-place (P19)
#' p <- w_Property(l, Pproperty = 'P19', includeQ=TRUE, langsorder='es|en', debug='info')
#' # Filter entities that have places
#' places <- p[grepl("^Q\\d+$", p$P19), ]$P19
#' g <- w_Geoloc(places, langsorder='en|es', debug='info')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_Geoloc <- function(entity_list, langsorder='', nlimit=1000, debug=FALSE) {
  # check entities
  entity_list <- checkEntities(entity_list)
  #
  n <- length(entity_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_Geoloc, entity_list, nlimit, langsorder=langsorder, debug=debug))
  }
  #
  langsorder <- trimws(langsorder)
  if (langsorder == '') {
    sss <- '?place (STR(SAMPLE(?clat)) as ?placeLat) (STR(SAMPLE(?clon)) as ?placeLon) ?country'
    sqq <- ''
    sgg <- '?place ?country'
  }
  else {
    sss <- '?place ?placeLabel (STR(SAMPLE(?clat)) as ?placeLat) (STR(SAMPLE(?clon)) as ?placeLon) ?country ?countryLabel'
    langsorder <- gsub("|", ",", langsorder, fixed = T)
    sqq <- paste0('SERVICE wikibase:label {bd:serviceParam wikibase:language "',
                  langsorder,'".\n    ?place rdfs:label ?placeLabel.\n    ?country rdfs:label ?countryLabel.}')
    sgg <- '?place ?placeLabel ?country ?countryLabel'
  }
  #
  values <- paste0("wd:",paste0(entity_list, collapse = ' wd:'))
  query <- paste0("SELECT DISTINCT ",sss, "
WHERE {
  OPTIONAL {
    VALUES ?place {", values, "}
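    # P1366 ('replaced by', followed transitively) maps a former place to its
    # present-day successor; P625 is the coordinate location property.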
    OPTIONAL {?place wdt:P1366+ ?placelast.}
    OPTIONAL {?place wdt:P625 ?c1.}
    OPTIONAL {?placelast wdt:P625 ?c2.}
    BIND(COALESCE(?c1, ?c2) AS ?c).
    BIND(geof:longitude(?c) AS ?clon)
    BIND(geof:latitude(?c)  AS ?clat)
    BIND(COALESCE(?placelast, ?place) AS ?actualplace).
    OPTIONAL {
      ?actualplace wdt:P17 ?country.
      ?country wdt:P31 ?instance.
      FILTER (?instance in (wd:Q3624078, wd:Q7275, wd:Q6256)).
      #not a former country
      # FILTER NOT EXISTS {{?country wdt:P31 wd:Q3024240}}
      #and no an ancient civilisation (needed to exclude ancient Egypt)
      # FILTER NOT EXISTS {{?country wdt:P31 wd:Q28171280}}
      FILTER (?instance not in (wd:Q3024240, wd:Q28171280)).
    }
  }
  ",sqq,"
} GROUP BY ", sgg, "\n")
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format='csv', method='POST')
  if(is.null(r)){
    return(NULL)
  }
  r$place   <- sub('^http://www.wikidata.org/entity/', '', r$place)
  r$country <- sub('^http://www.wikidata.org/entity/', '', r$country)
  #
  # Sometimes an ancient/old place corresponds to several present-day places and
  # several countries. Other times the place is a border between two countries.
  # But for a given place, the function should only return one country. There
  # are some approaches that can be applied in Wikidata, but they are not
  # uniform (number of references, time-stamps, etc.). In general, this happens
  # very rarely, so removing duplicates may be a good option.
  r <- r[!duplicated(r$place),]
  rownames(r) <- r$place
  return(r)
}

#' Return labels and/or descriptions of Wikidata entities
#'
#' Return labels and/or descriptions of the entities in entity_list in the
#' languages indicated in `langsorder`. Note that entities can be Wikidata
#' entities (Qxxx) or Wikidata properties (Pxxx).
#'
#' @param entity_list A vector with the Wikidata entities.
#' @param what Retrieve only Labels (L), only Descriptions (D) or both (LD).
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, next is used. This parameter is mandatory, at least one language is
#' required, default 'en'.
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please,
#' reduce the default value if error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return A data-frame with one column for the entities, and others for the
#' language and the labels and/or descriptions. The index of the dataframe is
#' also set to the entity list.
#' @examples
#' \dontrun{
#' w_LabelDesc(c("Q57860", "Q712609", "Q381800", "P569"), what='LD', langsorder = 'se|es|en')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_LabelDesc <- function(entity_list, what='LD', langsorder='en', nlimit=25000,
                        debug=FALSE) {
  langsorder <- trimws(langsorder)
  if (langsorder=='' || (!grepl('L', what) && (!grepl('D', what))))
    stop(paste0("ERROR: one or both parameters 'what' or 'langsorder' are incorrect."))
  #
  # check entities
  entity_list <- checkEntities(entity_list)
  #
  n <- length(entity_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_LabelDesc, entity_list, nlimit, what=what, langsorder=langsorder, debug=debug))
  }
  #
  ss <- ''
  qs <- ''
  if (grepl('L', what)) {
    ss <- paste0(ss, '(LANG(?label) as ?labellang) ?label')
    qs <- paste0(qs, '    ?entity rdfs:label ?label.\n')
  }
  if (grepl('D', what)) {
    ss <- paste0(ss, ' (LANG(?description) as ?descriptionlang) ?description')
    qs <- paste0(qs, '    ?entity schema:description ?description.\n')
  }
  #
  values <- paste0("wd:",paste0(entity_list, collapse = ' wd:'))
  langsorder <- gsub("|", ",", langsorder, fixed = T)
  #
  query <- paste0('SELECT ?entity ',ss,'
WHERE {
  VALUES ?entity {', values, '}
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "', langsorder, '".\n', qs,
                  '  }
}\n')
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format='csv', method='POST')
  if(is.null(r)){
    return(NULL)
  }
  r$entity <- sub('^http://www.wikidata.org/entity/', '', r$entity)
  rownames(r) <- r$entity
  return(r)
}

#' Get Wikidata entities with a certain occupation
#'
#' Return the Wikidata entities which have the occupation indicated in `Qoc`,
#' the Wikidata entity for that occupation. For example, if Qoc='Q2306091',
#' the Wikidata entities whose occupation is "sociologist" are returned, among
#' others. The Wikidata classes of which the entities are instances are also
#' returned. If the parameter langsorder='', no labels or descriptions of the
#' entities are returned, otherwise the function returns them in the language
#' order indicated in `langsorder`. If wikilangs='' (when mode='wikipedias'),
#' the Wikipedia pages are not filtered by language, otherwise only Wikipedias
#' in the languages of this parameter are returned.
#'
#' @param Qoc The Wikidata entity of the occupation. For example, Q2306091 for
#' sociologist, Q2526255 for Film director, etc.
#' @param mode The results to obtain: 'default' returns the Wikidata entities
#' which have the indicated occupation; 'count' queries WDQS for the number of
#' Wikidata entities with that occupation; 'wikipedias' also returns the
#' Wikipedia pages of the entities.
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, next is used. If langsorder='', then labels or descriptions
#' are not returned.
#' @param wikilangs List of languages in Wikipedias to limit the search, using
#' "|" as separator (only if mode='wikipedias'). Wikipedias page titles are
#' returned in same order as languages in this parameter. If wikilangs='' the
#' function returns Wikipedia page titles of entities in any language, not
#' sorted.
#' @param nlimit If the number of entities in that occupation exceeds this
#' number, then queries are made in chunks. The value can be increased if
#' `langsorder`=''. Please reduce the default value if an error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown. If debug='count' the function only returns the number of
#' entities with that occupation.
#' @return A data-frame with 'entity' and  'entityLabel', 'entityDescription',
#' 'instanceof' and 'instanceofLabel' columns. Index of the data-frame
#' is also set to the list of entities found.
#' @examples
#' \dontrun{
#' # "Q2306091" Qoc for Sociologist
#' w_SearchByOccupation(Qoc="Q2306091", mode='count')
#' q <- w_SearchByOccupation(Qoc="Q2306091", langsorder="")
#' q <- w_SearchByOccupation(Qoc="Q2306091", langsorder="en|es|fr")
#' q <- w_SearchByOccupation(Qoc="Q2306091", mode='wikipedias', debug='info')
#' q <- w_SearchByOccupation(Qoc="Q2306091", mode='wikipedias', wikilangs='en|es|fr', debug='info')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_SearchByOccupation <- function(Qoc, mode=c('default','count','wikipedias'),
                                 langsorder='', wikilangs='', nlimit=10000, debug=FALSE) {
  mode <- mode[1]
  #
  # First: get the number of entities
  query <- paste0('SELECT (COUNT(DISTINCT ?entity) AS ?count) WHERE {?entity wdt:P106 wd:',Qoc,'}')
  d <- w_query(query, method='GET', format="csv")
  if(is.null(d)){
    return(NULL)
  }
  nq <- d$count[1]
  #
  if (debug!=FALSE) {
    cat(paste0("INFO: The number of entities with that occupation is ",nq,".\n"), file=stderr())
  }
  if (mode == 'count')
    return(nq)
  #
  langsorder <- trimws(langsorder)
  if (langsorder == '') {
    ss1 <- ''
    sq  <- ''
    ss2 <- ''
  }
  else {
    ss1 <- "?entityLabel ?entityDescription"
    ss2 <- "(GROUP_CONCAT(DISTINCT ?instancLabel; separator='|') as ?instanceofLabel)"
    langsorder <- gsub("|", ",", langsorder, fixed = T)
    sq = paste0('SERVICE wikibase:label {bd:serviceParam wikibase:language "',langsorder,'".
      ?entity rdfs:label ?entityLabel.
      ?entity schema:description ?entityDescription.
      ?instanc rdfs:label ?instancLabel.}')
  }
  #
  nlim <- as.integer(nq/nlimit)
  if (nlim>0 && debug!=FALSE)
    cat(paste0("INFO: The number of entities (",nq ,") exceeds chunksize (", nlimit, ").\n"), file=stderr())
  output <- NULL
  for (k in 0:nlim){
    offset <- k*nlimit
    if ((offset+1) > nq)
      break
    if (nlim>0 && debug!=FALSE)
      cat(paste0("  INFO: Requesting elements from ", offset+1, " to ",
                 min(offset+nlimit, nq),"\n"), file=stderr())
    #
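    # The WITH clause defines a named subquery (%results) that selects one chunk
    # of entities via LIMIT/OFFSET; labels and P31 values are then attached only
    # to that chunk in the outer query.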
    query <- paste0("SELECT DISTINCT ?entity ",ss1,"
(GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof)
",ss2,"
WITH {
  SELECT DISTINCT ?entity
  WHERE {?entity wdt:P106 wd:",Qoc,".}
  ORDER BY ?entity
  LIMIT ",nlimit," OFFSET ",offset,"
  } AS %results
WHERE {
  INCLUDE %results.
  ",sq,"
  OPTIONAL {?entity wdt:P31 ?instanc.}
} GROUP BY ?entity ", ss1 ,"\n")
    #
    if (debug == "query")
      cat(query, file=stderr())
    #
    r <- w_query(query, format='csv', method='POST')
    if(is.null(r)){
      next
    }
    r$entity  <- gsub('http://www.wikidata.org/entity/', '', r$entity)
    r$instanceof  <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
    rownames(r) <- r$entity
    if (k==0)
      output <- r
    else
      output <- rbind(output, r)
  }
  #
  if (mode=='wikipedias') {
    if (debug!=FALSE)
      cat("INFO: Searching for Wikipedias.\n", file=stderr())
    w <- w_Wikipedias(output$entity, wikilangs = wikilangs, debug=debug)
    output <- cbind(output, subset(w, select = setdiff(names(w),c("entity","instanceof"))))
  }
  #
  return(output)
}


#' Search for entities that may match identifiers in a database or authorities'
#' catalog.
#'
#' The identifiers are in id_list. The database or authorities' catalog to which
#' these identifiers belong must be provided in parameter `Pauthority`.
#' If parameter langsorder='', then no labels or descriptions of the entities
#' are returned, otherwise the function returns them in the language order
#' indicated in `langsorder`. Duplicated entities are removed before the
#' search. The index of the returned data-frame is also set to id_list.
#'
#' @param id_list List of identifiers.
#' @param Pauthority Wikidata property identifier of the database or
#' authorities' catalog. For example, if Pauthority = "P4439", then the function
#' searches for entities that have the identifiers in the MNCARS (Museo Nacional
#' Centro de Arte Reina Sofía) database. The following library abbreviations
#' for the databases can also be used in the parameter 'Pauthority':
#'
#' library   : VIAF, LC,   BNE,  ISNI, JPG,  ULAN, BNF,  GND,  DNB,
#' Pauthority: P214, P244, P950, P213, P245, P245, P268, P227, P227,
#'
#' library   : SUDOC, NTA,   J9U,   ELEM,  NUKAT, MNCARS
#' Pauthority: P269,  P1006, P8189, P1565, P1207, P4439
#'
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, next is used. If langsorder='', then labels or descriptions are
#' not returned.
#' @param nlimit If the number of entities in the database or authorities'
#' catalog exceeds this number, then queries are made in chunks. The value can
#' be increased if langsorder=''. Please reduce the default value if an error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown. If debug='count' the function only returns the number of
#' entities that have an identifier in that authority.
#' @return A data-frame with columns: 'entity', 'entityLabel', 'entityDescription',
#' 'instanceof', 'instanceofLabel' and the identifier in the "Pauthority" database.
#' The index of the data-frame is also set to the identifiers in id_list.
#' @examples
#' \dontrun{
#' w_SearchByIdentifiers(c("4938246", "36092166", "40787112"), Pauthority='P214')
#' w_SearchByIdentifiers(c("4938246", "36092166", "40787112"), Pauthority='P214', langsorder='en|fr')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_SearchByIdentifiers <- function(id_list, Pauthority, langsorder='', nlimit=3000,
                                  debug=FALSE) {
  #
  authorities <- c('P214', 'P244', 'P950', 'P213', 'P245', 'P245', 'P268',
                   'P227', 'P227', 'P269', 'P269', 'P1006', 'P8189',
                   'P1565', 'P1207', 'P3065', 'P8179', 'P4787', 'P1015',
                   'P1015', 'P9984','P9984', 'P7293', 'P409', 'P4439')
  names(authorities) <- c('VIAF', 'LC', 'BNE', 'ISNI', 'JPG', 'ULAN', 'BNF',
                          'GND', 'DNB', 'SUDOC', 'idRefID', 'NTA', 'J9U',
                          'ELEM', 'NUKAT', 'RERO', 'CAOONL', 'NII', 'BIBSYS',
                          'NORAF', 'BNC', 'CANTIC', 'PLWABN', 'NLA', 'MNCARS')
  #
  if (!grepl("^P\\d+$",Pauthority)) {
    if (!Pauthority %in% names(authorities))
      stop(paste0("ERROR: Invalid value '",Pauthority,"' for parameter 'Pauthority'"))
    Pauthority <- authorities[Pauthority]
  }
  #
  n <- length(id_list)
  # Check limits to make chunked queries
  if (n > nlimit) {
    if (debug != FALSE)
      cat(paste0("INFO: The number of entities (", n,") exceeds nlimit (",
                 nlimit,')\n'), file=stderr())
    return (doChunks(w_SearchByIdentifiers, id_list, nlimit, Pauthority=Pauthority,
                     langsorder=langsorder, debug=debug))
  }
  #
  if (langsorder == '') {
    ss1 <- ''
    sq  <- ''
    ss2 <- ''
  }
  else {
    ss1 <- "?entityLabel ?entityDescription"
    ss2 <- "(GROUP_CONCAT(DISTINCT ?instancLabel; separator='|') as ?instanceofLabel)"
    langsorder <- gsub("|", ",", langsorder, fixed = T)
    sq = paste0('SERVICE wikibase:label {bd:serviceParam wikibase:language "',langsorder,'".
      ?entity rdfs:label ?entityLabel.
      ?entity schema:description ?entityDescription.
      ?instanc rdfs:label ?instancLabel.}')
  }
  #
  values <- paste0('"',paste0(id_list, collapse = '" "'), '"')
  #
  query <- paste0("SELECT DISTINCT ?id ?entity ",ss1,"
(GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof)
",ss2,"
WHERE {
  OPTIONAL {
    VALUES ?id {",values,"}
    OPTIONAL {?entity wdt:", Pauthority," ?id;
                      wdt:P31 ?instanc.}
      ", sq,"
  }
} GROUP BY ?id ?entity ", ss1 ,"\n")

  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format='csv', method='POST')
  if(is.null(r)){
    return(NULL)
  }

  r$entity  <- gsub('http://www.wikidata.org/entity/', '', r$entity)
  r$instanceof  <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
  rownames(r) <- r$id
  return(r)
}

#' Get entities that have an identifier in a database or authorities' catalog.
#'
#' Get all Wikidata entities that have an identifier in the database or
#' authorities' catalog indicated in the parameter `Pauthority`. If the
#' parameter `langsorder`='', then no labels or descriptions of the entities
#' are returned, otherwise the function returns them in the language order
#' indicated in `langsorder`. Filtering by class is possible with the parameter
#' `instanceof`.
#' If only the number of entities which have an identifier in the database or
#' authorities' catalog is needed, set `debug`='count'.
#'
#' @param Pauthority Wikidata property identifier of the database or
#' authorities' catalog. For example, if Pauthority = "P4439", all entities
#' which have an identifier in the MNCARS (Museo Nacional Centro de Arte Reina
#' Sofía) database are returned. The following library abbreviations for the
#' databases can also be used in the parameter 'Pauthority':
#'
#' library   : VIAF, LC,   BNE,  ISNI, JPG,  ULAN, BNF,  GND,  DNB,
#' Pauthority: P214, P244, P950, P213, P245, P245, P268, P227, P227,
#'
#' library   : SUDOC, NTA,   J9U,   ELEM,  NUKAT, MNCARS
#' Pauthority: P269,  P1006, P8189, P1565, P1207, P4439
#'
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, next is used. If langsorder=='', then labels or descriptions are
#' not returned.
#' @param instanceof Wikidata entity of which the entities searched for are an
#' example or member of it (class). Optional. For example, if instanceof="Q5"
#' the search is filtered to Wikidata entities of class Q5 (human). Several
#' entity classes are allowed, separated with '|'.
#' @param nlimit If the number of entities in the database or authorities'
#' catalog exceeds this number, then queries are made in chunks. The value can
#' be increased if langsorder=''. Please reduce the default value if an error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown. If debug='count' the function only returns the number of
#' entities that have an identifier in that authority.
#' @return A data-frame with columns: 'entity', 'entityLabel', 'entityDescription',
#' 'instanceof', 'instanceofLabel' and the identifier in the "Pauthority" database.
#' Index of the data-frame is also set to the list of entities found.
#' @examples
#' \dontrun{
#' # Example: Pauthority=P4439 (has identifier in the Museo Nacional Centro de
#' # Arte Reina Sofía)
#' w_SearchByAuthority(Pauthority="P4439", debug='count')
#' mncars <- w_SearchByAuthority(Pauthority="P4439")
#' mncars <- w_SearchByAuthority(Pauthority="MNCARS", langsorder = 'es|en')
#' # Wikidata entities are not 'human' (Q5):
#' mncars[!grepl("\\bQ5\\b", mncars$instanceof), ]
#' # Wikidata entities are 'human' (Q5):
#' mncars <- w_SearchByAuthority(Pauthority="MNCARS", langsorder = 'es|en', instanceof='Q5')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_SearchByAuthority <- function(Pauthority, langsorder='', instanceof='', nlimit=10000,
                                debug=FALSE) {
  #
  authorities <- c('P214', 'P244', 'P950', 'P213', 'P245', 'P245', 'P268',
                   'P227', 'P227', 'P269', 'P269', 'P1006', 'P8189',
                   'P1565', 'P1207', 'P3065', 'P8179', 'P4787', 'P1015',
                   'P1015', 'P9984','P9984', 'P7293', 'P409', 'P4439')
  names(authorities) <- c('VIAF', 'LC', 'BNE', 'ISNI', 'JPG', 'ULAN', 'BNF',
                          'GND', 'DNB', 'SUDOC', 'idRefID', 'NTA', 'J9U',
                          'ELEM', 'NUKAT', 'RERO', 'CAOONL', 'NII', 'BIBSYS',
                          'NORAF', 'BNC', 'CANTIC', 'PLWABN', 'NLA', 'MNCARS')
  #
  if (!grepl("^P\\d+$",Pauthority)) {
    if (!Pauthority %in% names(authorities))
      stop(paste0("ERROR: Invalid value '",Pauthority,"' for parameter 'Pauthority'"))
    Pauthority <- authorities[Pauthority]
  }
  #
  if (langsorder == '') {
    ss1 <- ''
    sq  <- ''
    ss2 <- ''
  }
  else {
    ss1 <- "?entityLabel ?entityDescription"
    ss2 <- "(GROUP_CONCAT(DISTINCT ?instancLabel; separator='|') as ?instanceofLabel)"
    langsorder <- gsub("|", ",", langsorder, fixed = T)
    sq = paste0('SERVICE wikibase:label {bd:serviceParam wikibase:language "',langsorder,'".
      ?entity rdfs:label ?entityLabel.
      ?entity schema:description ?entityDescription.
      ?instanc rdfs:label ?instancLabel.}')
  }
  #
  # First: get the number of entities
  query <- paste0('SELECT (COUNT(DISTINCT ?entity) AS ?count) WHERE {?entity wdt:',Pauthority,' [].}')
  d <- w_query(query, method='GET', format="csv")
  if(is.null(d)){
    return(NULL)
  }
  nq <- d$count[1]
  #
  if (debug=='count') {
    cat(paste0("INFO: The number of entities is ", nq, ".\n"), file=stderr())
    return(nq)
  }
  #
  nlim <- as.integer(nq/nlimit)
  if (nlim>0 && debug!=FALSE)
    cat(paste0("INFO: The number of entities (", nq, ") exceeds chunksize (", nlimit, ").\n"), file=stderr())
  output <- NULL
  for (k in 0:nlim){
    offset <- k*nlimit
    if ((offset+1) > nq)
      break
    if (nlim>0 && debug!=FALSE)
      cat(paste0("  INFO: Requesting elements from ", offset+1, " to ",
                 min(offset+nlimit, nq),"\n"), file=stderr())
    #
    query <- paste0("SELECT DISTINCT ?entity ",ss1,"
(GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof)
",ss2,"
(GROUP_CONCAT(DISTINCT STR(?authid);separator='|') as ?",Pauthority,")
WITH {
  SELECT DISTINCT ?entity ?authid
  WHERE {?entity wdt:",Pauthority," ?authid.}
  ORDER BY ?entity
  LIMIT ",nlimit," OFFSET ",offset,"
  } AS %results
WHERE {
  INCLUDE %results.
  ",sq,"
  OPTIONAL {?entity wdt:P31 ?instanc.}
} GROUP BY ?entity ", ss1 ,"\n")
    #
    if (debug == "query")
      cat(query, file=stderr())
    #
    r <- w_query(query, format='csv', method='POST')
    if(is.null(r)){
      next
    }
    r$entity  <- gsub('http://www.wikidata.org/entity/', '', r$entity)
    r$instanceof  <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
    rownames(r) <- r$entity
    if (k==0)
      output <- r
    else
      output <- rbind(output, r)
  }
  # Filter instanceof
  if (instanceof!='')
    output <- output[grepl(paste0('\\b(?:', instanceof, ")\\b"), output$instanceof), ]
  #
  return(output)
}

#' Get entities which are instances of a Wikidata entity
#'
#' Get all Wikidata entities which are instances of one or more Wikidata entities
#' like films, cities, etc. If parameter `langsorder`='', then no labels or
#' descriptions of the entities are returned, otherwise the function returns
#' them in the language order indicated in `langsorder`.
#' @param instanceof Wikidata entity of which the entities searched for are an
#' example or member of it (class). For example, if instanceof="Q229390" return
#' Wikidata entities of class Q229390 (3D films). More than one entities can be
#' included in the `instanceof` parameter, with '|' or '&' separator:
#'  - if '|' (instanceof='Q229390|Q202866') then the OR operator is used.
#'  - if '&' (instanceof='Q229390|Q202866') then the AND operator is used.
#'  Note that '|' and '&' cannot be present at the same time.
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, next is used. If langsorder=='', then labels or descriptions are
#' not returned.
#' @param nlimit If the number of entities exceeds this number, then queries
#' are made in chunks. The value can be increased if langsorder=''. Please
#' reduce the default value if an error is raised.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown. If debug='count' the function only returns the number of
#' entities.
#' @return A data-frame. Index of the data-frame is also set to the list of entities found.
#' @examples
#' \dontrun{
#' w <- w_SearchByInstanceof('Q229390|Q25110269', langsorder = 'es|en')
#' w <- w_SearchByInstanceof('Q229390&Q25110269', langsorder = 'es|en')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_SearchByInstanceof <- function(instanceof, langsorder='', nlimit=2500, debug=FALSE) {
  #
  searchOR <- searchAND <- FALSE
  if (!grepl("^Q\\d+([|&]Q\\d+)*$",instanceof))
    stop(paste0("ERROR: Invalid value '",instanceof,"' for parameter 'instanceof'"))
  if (grepl('|', instanceof, fixed=TRUE))
    searchOR = TRUE
  if (grepl('&', instanceof, fixed=TRUE))
    searchAND = TRUE
  if (searchOR & searchAND)
    stop(paste0("ERROR: Invalid value '",instanceof,"' for parameter 'instanceof'"))
  #
  if (langsorder == '') {
    ss1 <- ''
    sq  <- ''
    ss2 <- ''
  }
  else {
    ss1 <- "?entityLabel ?entityDescription"
    ss2 <- "(GROUP_CONCAT(DISTINCT ?instancLabel; separator='|') as ?instanceofLabel)"
    langsorder <- gsub("|", ",", langsorder, fixed = T)
    sq = paste0('SERVICE wikibase:label {bd:serviceParam wikibase:language "',langsorder,'".
      ?entity rdfs:label ?entityLabel.
      ?entity schema:description ?entityDescription.
      ?instanc rdfs:label ?instancLabel.}')
  }
  #
  # First: get the number of entities
  #
  if (searchOR) {
    iof <- strsplit(instanceof, '|', fixed = T)[[1]]
    query <- paste0('SELECT (COUNT(DISTINCT *) AS ?count) WHERE {\n',
                    paste0('{[] wdt:P31 wd:', iof, '}', collapse='\n UNION\n'),"\n}\n")
    queryF <- paste0('SELECT DISTINCT ?entity WHERE {\n',
                     paste0('{?entity wdt:P31 wd:', iof, '}', collapse='\n UNION\n'),"\n}\n")
  }
  else if (searchAND) {
    iof <- strsplit(instanceof, '&', fixed = T)[[1]]
    values <- paste0("wd:", paste0(iof, collapse = ', wd:'))
    query <- paste0('SELECT (COUNT(DISTINCT *) AS ?count) WHERE {[] wdt:P31 ', values, '}\n')
    queryF <- paste0('SELECT DISTINCT ?entity WHERE {?entity wdt:P31 ', values, '}\n')
  }
  else {
    query <- paste0('SELECT (COUNT(DISTINCT *) AS ?count) WHERE {[] wdt:P31 wd:', instanceof, '}\n')
    queryF <- paste0('SELECT ?entity WHERE {?entity wdt:P31 wd:', instanceof, '}\n')
  }

  d <- w_query(query, method='GET', format="csv")
  if(is.null(d)){
    return(NULL)
  }
  nq <- d$count[1]
  #
  if (debug=='count') {
    cat(paste0("INFO: The number of entities is ", nq, ".\n"), file=stderr())
    return(nq)
  }
  #
  nlim <- as.integer(nq/nlimit)
  if (nlim>0 && debug!=FALSE)
    cat(paste0("INFO: The number of entities (", nq, ") exceeds chunksize (", nlimit, ").\n"), file=stderr())
  output <- NULL
  for (k in 0:nlim){
    offset <- k*nlimit
    if ((offset+1) > nq)
      break
    if (nlim>0 && debug!=FALSE)
      cat(paste0("  INFO: Requesting elements from ", offset+1, " to ",
                 min(offset+nlimit, nq),"\n"), file=stderr())
    #
    query <- paste0("SELECT DISTINCT ?entity ",ss1,"
(GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof)",
                    ss2,"
WITH {
", queryF,
"  ORDER BY ?entity
  LIMIT ",sprintf("%d",nlimit)," OFFSET ",sprintf("%d",offset),"
  } AS %results
WHERE {
  INCLUDE %results.",
                    sq,"
  OPTIONAL {?entity wdt:P31 ?instanc.}
} GROUP BY ?entity ", ss1 ,"\n")
    #
    if (debug == "query")
      cat(query, file=stderr())
    #
    r <- w_query(query, format='csv', method='POST')
    if(is.null(r)){
      next
    }
    r$entity  <- gsub('http://www.wikidata.org/entity/', '', r$entity)
    r$instanceof  <- gsub('http://www.wikidata.org/entity/', '', r$instanceof)
    rownames(r) <- r$entity
    if (k==0)
      output <- r
    else
      output <- rbind(output, r)
  }
  return(output)
}

#' Search Wikidata entities by string (usually labels)
#'
#' Search Wikidata entities in label and altLabel ("Also known as") or in any
#' part of the entity using different approaches.
#'
#' @param string String (label or altLabel) to search. Note that single
#' quotation mark must be escaped (string="O\\'Donell"), otherwise an error will
#' be raised.
#' @param mode The mode to perform search. Default 'inlabel' mode.
#'  - 'exact' for an exact search in label or altLabel using case-sensitive
#'    search and differentiating diacritics. The languages in the `langs`
#'    parameter are used, so this parameter is mandatory in this mode.
#'  - 'startswith' for entities whose label or altLabel starts with the string,
#'    similar to a wildcard search "string*". The string is searched in label in
#'    the languages of the `langs` parameter, but in any language in altLabel, so
#'    the `langs` parameter is also mandatory in this mode. Diacritics and case
#'    are ignored in this mode.
#'  - 'cirrus' searches words in any order in any part of the entity, not only
#'    in label or altLabel. Diacritics and case are ignored. It is a full-text
#'    search using the ElasticSearch engine.
#'    Phrase search can be used if launched with double quotation marks, for
#'    example, string='"Antonio Saura"'. Fuzzy search is also possible, for
#'    example, string="algermon~1" or string="algernon~2". REGEX search can also
#'    be used (but it is a very limited functionality) using this format:
#'    string="insource:/regex/i" (i: is for ignore case, optional).
#'    In this mode, the parameter `langs` is ignored.
#'  - 'inlabel' is a special case of 'cirrus' search for matching whole words
#'    (in any order) in any position in label or altLabel. With this mode no
#'    fuzzy search can be used, but some languages can be set in the `langs`
#'    parameter.
#'    Modes 'inlabel' and 'cirrus' use the CirrusSearch of the Wikidata API.
#'    Please, for more examples, see https://www.mediawiki.org/wiki/Help:CirrusSearch
#'    and https://www.mediawiki.org/wiki/Help:Extension:WikibaseCirrusSearch
#' @param langs Languages in which the information will be searched, using "|"
#' as separator. In 'exact' or 'startswith' modes this parameter is mandatory:
#' at least one language is required. In 'inlabel' mode, if the parameter
#' `langs` is set, then the search is restricted to the languages in this
#' parameter, otherwise any language is used. In 'cirrus' mode this parameter
#' is ignored.
#' @param langsorder Order of languages in which the information will be
#' returned, using "|" as separator. If `langsorder`='', no labels or
#' descriptions will be returned, otherwise, they are returned in the order of
#' languages in this parameter, if any.
#' @param instanceof Wikidata entity of which the entities searched for are an
#' example or member of it (class). For example, if instanceof='Q5' the
#' search are filtered to Wikidata entities of class Q5 (human). Some
#' entity classes are allowed, separated with '|'.
#' @param Pproperty Wikidata properties to search, separated with '|'. For
#' example, if Pproperty="P21", the results contain information of the sex
#' of entities. If Pproperty="P21|P569" it also searches for birth dates. If
#' Pproperty='P21|P569|P214' it also searches for VIAF identifiers.
#' @param debug For debugging purposes (default FALSE). If debug='query' the
#' query launched is shown.
#' @return A data-frame with 'entity', 'entityLabel', 'entityDescription',
#' (including 'instance', 'instanceLabel', 'altLabel' if mode="startswith")
#' and additionally the properties of Pproperty.
#' @examples
#' \dontrun{
#' df <- w_SearchByLabel(string='Iranzo', mode="exact", langs='es|en')
#' df <- w_SearchByLabel(string='Iranzo', mode="exact", langs='es|en',
#'                       langsorder='es|en', instanceof = 'Q5|Q101352')
#' ## Search entities which label or altLabel starts with "string"
#' df <- w_SearchByLabel(string='Iranzo', mode='startswith', langs='en', langsorder='es|en')
#' ## Search in any position in Label or AltLabel (diacritics and case are ignored)
#' df <- w_SearchByLabel(string='Iranzo', mode='inlabel', langsorder='es|en')
#' ## Search in Chinese (Simplified) (language code: zh) in any part of entity:
#' df <- w_SearchByLabel(string='\u4F0A\u5170\u4f50', mode='cirrus', langsorder='es|zh|en')
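#' ## Sketch: add properties P21 (sex) and P569 (birth date) to the results
#' df <- w_SearchByLabel(string='Iranzo', mode='inlabel', langsorder='es|en',
#'                       Pproperty='P21|P569')
#' ## Sketch: phrase search with the CirrusSearch engine
#' df <- w_SearchByLabel(string='"Antonio Saura"', mode='cirrus', langsorder='es|en')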
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_SearchByLabel <- function(string, mode='inlabel', langs="", langsorder='',
                            instanceof="", Pproperty="", debug=FALSE) {
  mode <- trimws(mode)
  string <- trimws(string)
  # string <- gsub("'", "\\'", trimws(string), fixed = T)
  #
  if (string=='' || !(mode %in% c('exact', 'startswith', 'inlabel', 'cirrus')))
    stop(paste0("ERROR. Parameter 'string' or 'mode' are invalid."))
  if (mode %in% c('exact', 'startswith') && langs=='')
    stop(paste0("ERROR. Parameter 'langs' is mandatory in mode '", mode, "'"))
  if (mode=='cirrus' && langs!='')
    cat(paste0("ERROR. In mode '",mode,"' the parameter 'langs' is ignored."), file=stderr())
  #
  p_show <- p_query <- p_lang <- ''
  if (Pproperty!='') {
    p_show  <- character()
    p_query <- character()
    p_lang <- character()
    for (p in strsplit(Pproperty, '|', fixed = T)[[1]]) {
      p_show <- append(p_show, paste0("(GROUP_CONCAT(DISTINCT ?", p ,"p;separator='|') as ?",p,")"))
      if (langsorder!='')
        p_show <- append(p_show, paste0("(GROUP_CONCAT(DISTINCT STR(?", p ,"label);separator='|') as ?",p,"Label)"))
      p_query <- append(p_query, paste0('  OPTIONAL {?entity wdt:', p, ' ?', p, 'p.}\n'))
      p_lang  <- append(p_lang, paste0('    ?', p,'p rdfs:label ?', p, 'label.\n'))
    }
    p_show  <- paste(p_show, collapse = "\n")
    p_query <- paste0(p_query, collapse = "")
    p_lang  <- paste0(p_lang, collapse = "")
  }

  if (langsorder == '') {
    ss1 <- ''
    ss2 <- p_show
    sq  <- p_query
  }
  else {
    ss1 <- "?entityLabel ?entityDescription"
    ss2 <- paste0("(GROUP_CONCAT(DISTINCT ?instancLabel; separator='|') as ?instanceofLabel)\n",
                  p_show)
    langsorder <- gsub('|', ',', langsorder, fixed=T)
    sq <- paste0(p_query, '  SERVICE wikibase:label {bd:serviceParam wikibase:language "',langsorder,'".
    ?entity rdfs:label ?entityLabel.
    ?entity schema:description ?entityDescription.
    ?instanc rdfs:label ?instancLabel.\n',
                 p_lang,'  }')
  }
  #
  # Start of the query
  query <- paste0("SELECT DISTINCT ?entity ", ss1,
                  "\n(GROUP_CONCAT(DISTINCT ?instanc; separator='|') as ?instanceof)\n",
                  ss2,"\n")
  #
  if (mode=="exact")  # UNION sentences with each language
  {
    l <- strsplit(langs, '|', fixed = T)[[1]]
    units <- paste0("{?entity rdfs:label '", string, "'@", l, '}')
    unionlabel <- paste0(units, collapse = "\nUNION\n")
    unitsalt <- paste0("{?entity skos:altLabel '", string, "'@", l, '}')
    unionaltLabel <- paste0(unitsalt, collapse = "\nUNION\n")
    #
    query <- paste0(query, "WHERE {\n", unionlabel,"\n UNION\n", unionaltLabel)
  }
  else if (mode=="startswith")
  {
    mwapi = character()
    for (l in strsplit(langs, '|', fixed = T)[[1]]) {
      mwapi <- append(mwapi,
                      paste0('SERVICE wikibase:mwapi {
  bd:serviceParam wikibase:api "EntitySearch";
                  wikibase:endpoint "www.wikidata.org";
                  mwapi:language "', l, '";
                  mwapi:search "', string, '" .
  ?entity wikibase:apiOutputItem mwapi:item.
  }'))}
    mwapi <- paste0(mwapi, collapse = "\n    UNION\n    ")
    #
    query <- paste0(query, "WITH {\n    SELECT DISTINCT ?entity\n     WHERE {\n",
                    mwapi, "    }\n  } AS %results\n  WHERE {\n    INCLUDE %results\n")
  }
  else if (mode %in% c('cirrus', 'inlabel')) {
    if (mode=='inlabel')
      string <- paste0('inlabel:', string)
    if (langs != '' && mode=='inlabel')
      string <- paste0(string, '@', gsub('|', ',', langs, fixed=T))
    #
    query <- paste0(query, "WHERE {\n  SERVICE wikibase:mwapi {
    bd:serviceParam wikibase:api \"Search\";
                    wikibase:endpoint \"www.wikidata.org\";
                    mwapi:srsearch '", string, "'.
    ?entity wikibase:apiOutputItem mwapi:title.
  }")
  }
  else {
    stop("ERROR: wrong mode specified.")
  }
  #
  # The final part of the query:
  query <- paste0(query, "\n  OPTIONAL {?entity wdt:P31 ?instanc.}\n",
                  sq, "\n} GROUP BY ?entity ", ss1, "\n")
  #
  if (debug == "query")
    cat(query, file=stderr())
  #
  r <- w_query(query, format = "csv", method="GET")
  if(is.null(r)){
    return(NULL)
  }
  if (nrow(r) == 0)
    return(r)
  #
  for (e in c('entity',  'instanceof'))
    r[[e]] <- gsub('http://www.wikidata.org/entity/', '', r[[e]])
  if (Pproperty!='')
    for (e in strsplit(Pproperty, '|', fixed = T)[[1]])
      r[[e]] <- gsub('http://www.wikidata.org/entity/', '', r[[e]])
  rownames(r) <- r$entity
  #
  # Filter instanceof
  if (instanceof!='')
    r <- r[grepl(paste0('\\b(?:', instanceof, ")\\b"), r$instanceof), ]
  #
  return(r)
}


#' Get information about a Wikidata entity (human or film)
#'
#' Get labels, descriptions and some properties of the Wikidata entities in
#' entity_list, for persons or films. For persons, the information returned
#' covers labels, descriptions, birth and death dates and places, occupations,
#' works, education sites, awards, identifiers in some databases, Wikipedia page
#' titles (which can be limited to the languages in the `wikilangs` parameter),
#' etc. For films, the information covers title, directors, screenwriter,
#' cast members, producers, etc.
#'
#' @param entity_list The Wikidata entities to search for properties (persons or
#' films).
#' @param mode In "default" mode, the list of entities is expected to correspond
#' to persons, and person-related information is obtained. If the mode is
#' "film", information related to films will be requested. If the mode is
#' "tiny", fewer properties are requested.
#' @param langsorder Order of languages in which the information will be
#' returned, separated with '|'. If no information is given in the first
#' language, the next one is used. For label and description, English is used
#' as fallback language; if they do not exist in English, the information is
#' returned in any other available language. The language used for label and
#' description is also returned. If langsorder=='', no information other than
#' labels or descriptions is returned (in any language), only Wikidata
#' entities; otherwise, the order in this parameter is used to retrieve
#' information.
#' @param wikilangs List of languages to limit the search of Wikipedia pages,
#' using "|" as separator. Wikipedias pages are returned in same order as
#' languages in this parameter. If wikilangs='' the function returns Wikipedia
#' pages in any language, not sorted.
#' @param nlimit If the number of entities exceeds this number, chunked queries
#' are done. This is the number of entities requested in each chunk. Please
#' reduce the default value if an error is raised.
#' @param debug For debugging (info or query)
#' @return A data-frame with the properties of the entity. Also index is set to
#' entity_list.
#' @importFrom collections dict
#' @examples
#' \dontrun{
#' df <- w_EntityInfo(entity_list='Q134644', langsorder='es|en')
#' df <- w_EntityInfo(entity_list='Q134644', langsorder='es|en', wikilangs='es|en|fr')
#' df <- w_EntityInfo(c('Q270510', 'Q1675466', 'Q24871'), mode='film',
#'                    langsorder='es|en', wikilangs='es|en|fr')
#' # Search string 'abba' inlabel
#' w <- w_SearchByLabel('abba', mode='inlabel', langsorder = '', instanceof = 'Q5')
#' df <- w_EntityInfo(w$entity, langsorder='en', wikilangs='en|es|fr', debug='info')
#' # Search 3D films
#' w <- w_SearchByInstanceof(instanceof='Q229390', langsorder = 'en|es', debug = 'info')
#' df <- w_EntityInfo(w$entity, mode="film", langsorder='en', wikilangs='en', debug='info')
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
w_EntityInfo <- function(entity_list, mode='default', langsorder='',
                         wikilangs="",  nlimit=MW_LIMIT, debug=FALSE) {
  # Check entity_list
  entity_list <- checkEntities(entity_list)
  #
  if (mode=='film') {
    # For these claims, more than one occurrence separated with '|' is taken,
    # except for fields in fieldsonlyone, in which only the most referenced is taken
    fields <- c('P31', 'P577', 'P3383', 'P18', 'P10', 'P1476', 'P2047', 'P144',
                'P135', 'P136', 'P495', 'P364', 'P57', 'P58', 'P161', 'P725',
                'P1431', 'P344', 'P1040', 'P2554', 'P86', 'P162', 'P272',
                'P462', 'P180', 'P921', 'P166', 'P444', 'P214', 'P480', 'P345')
    names(fields) <- c('instanceof', 'pubdate', 'poster', 'pic', 'video', 'title',
                       'duration', 'basedon', 'movement', 'genre', 'country',
                       'originallanguage', 'director', 'screenwriter', 'castmember',
                       'voiceactor', 'executiveproducer', 'photographdirector',
                       'filmeditor', 'productiondesigner', 'composer', 'producer',
                       'productioncompany', 'color', 'depicts', 'mainsubject',
                       'award', 'reviewscore', 'VIAF', 'FilmAffinity', 'IMDb')
    # For these claims, only the most referenced value is taken (note, however,
    # that a value ranked "preferred" has priority).
    fieldsonlyone <- c('P577', 'P1476', 'P2047')
    # Note that wikipedias is added at last as new column of the dataframe
    columns <- c('entity', 'status', 'labellang', 'label', 'descriptionlang',
                 'description', 'instanceofQ', 'instanceof', 'pubdate', 'pubyear',
                 'poster', 'pic', 'video', 'duration', 'title', 'basedonQ',
                 'basedon', 'movementQ', 'movement', 'genreQ', 'genre',
                 'countryQ', 'country', 'originallanguageQ', 'originallanguage',
                 'directorQ', 'director', 'screenwriterQ', 'screenwriter',
                 'castmemberQ', 'castmember', 'voiceactorQ', 'voiceactor',
                 'executiveproducerQ', 'executiveproducer',
                 'photographdirectorQ', 'photographdirector', 'filmeditorQ', 'filmeditor',
                 'productiondesignerQ', 'productiondesigner', 'composerQ', 'composer',
                 'producerQ', 'producer', 'productioncompanyQ', 'productioncompany',
                 'colorQ', 'color', 'depictsQ', 'depicts', 'mainsubjectQ', 'mainsubject',
                 'awardQ', 'award', 'reviewscore', 'VIAF', 'FilmAffinity', 'IMDb',
                 'wikipedias')
  }
  else {
    # For these claims, more than one occurrence separated with '|' is taken,
    # except for fields in fieldsonlyone, in which only the most referenced is taken
    fields <- c('P31', 'P18', 'P21', 'P69', 'P106', 'P101', 'P135', 'P136',
                'P737', 'P800', 'P463', 'P166', 'P214', 'P950', 'P4439',
                'P19', 'P20', 'P569', 'P570')
    names(fields) <- c('instanceof', 'pic', 'sex', 'educatedat', 'occupation',
                       'fieldofwork', 'movement', 'genre', 'influencedby', 'notablework',
                       'memberof', 'award', 'viafid', 'bneid', 'mncarsid', 'bplace',
                       'dplace', 'bdate', 'ddate')
    # For these claims, only the most referenced value is taken (note, however,
    # that a value ranked "preferred" has priority).
    fieldsonlyone <- c('P19', 'P20', 'P569', 'P570')
    # Note that Wikipedias is added at last as new column of the dataframe
    columns <- c('entity', 'status', 'labellang', 'label', 'descriptionlang',
                 'description', 'instanceofQ', 'instanceof', 'sexQ', 'sex',
                 'bdate', 'byear', 'bplaceQ', 'bplace', 'bplaceLat', 'bplaceLon', 'bcountryQ', 'bcountry',
                 'ddate', 'dyear', 'dplaceQ', 'dplace', 'dplaceLat', 'dplaceLon', 'dcountryQ', 'dcountry',
                 'occupationQ', 'occupation', 'notableworkQ', 'notablework', 'educatedatQ', 'educatedat',
                 'fieldofworkQ', 'fieldofwork', 'movementQ', 'movement', 'genreQ', 'genre',
                 'influencedbyQ', 'influencedby', 'memberofQ', 'memberof', 'awardQ', 'award',
                 'viafid', 'bneid', 'mncarsid', 'pic', 'wikipedias')
  }
  #
  llangs <- strsplit(langsorder, '|', fixed = T)[[1]]
  # Fallback to English language
  if (!('en' %in% llangs))
    llangs <- append(llangs, 'en')
  langsorder <- paste0(llangs, collapse = '|')
  #
  wlangs <- character()   # wiki_langs
  if (wikilangs!='')
    wlangs <- strsplit(wikilangs, '|', fixed = T)[[1]]
  #
  d <- list()
  qidsoflabels <- character()  # store entities for searching labels
  qidsofplaces <- character()  # store entities for searching places
  #
  nq <- length(entity_list)
  # Check limits to make chunked queries
  nlim <- as.integer(nq/nlimit)
  if (nlim>0 && debug!=FALSE)
    cat(paste0("INFO: The number of entities (",nq ,") exceeds chunksize (", nlimit, ").\n"), file=stderr())
  for (k in 0:nlim)  {
    offset <- k*nlimit
    if ((offset+1) > nq)
      break
    t_list <- entity_list[(offset+1):min(nq,offset+nlimit)]
    if (nlim>0 && debug!=FALSE)
      cat(paste0("  INFO: Requesting elements from ", offset+1, " to ", offset+length(t_list),"\n"), file=stderr())
    #
    query <- list(format = 'json',
                  action = 'wbgetentities',
                  props  = 'labels|descriptions|claims|sitelinks',
                  ids    = paste0(t_list, collapse = '|'))
    #
    # Note that if wikilangs='', sitelinks of other projects (not only Wikipedia) are also returned.
    if (wikilangs!="")
      query["sitefilter"] = paste0(paste0(wlangs, 'wiki'), collapse='|')
    #
    j <- reqMediaWiki(query=query, project="www.wikidata.org", debug=debug)
    #
    if (is.null(j) || is.null(j$success) || j$success != 1)
      stop(paste0("Error in w_EntityInfo: reqMediaWiki returns an improper JSON."))
    #
    entities <- j$entities
    for (qid in names(entities)) {
      data <- entities[[qid]]
      #
      # if (debug!=FALSE)
      #   cat(paste0("  INFO: Requesting element ", qid,"\n"), file=stderr())
      #
      qiddict <- collections::dict(rep(NA, length(columns)), columns)   # set NA values, not NULL
      qiddict$set('entity', qid)
      qiddict$set('status', 'OK')
      # Check if entity is missing
      if (!is.null(data$missing)) {
        qiddict$set('status', 'missing')
        if (debug!=FALSE)
          cat(paste0("INFO: ",qid, " is missing\n"),file=stderr())
        next
      }
      # Check if entity is a redirection to other entity
      if (!is.null(data$redirects)) {
        qid_r <- data$id
        qiddict$set('status', qid_r)
        if (debug!=FALSE)
          cat(paste0("INFO: ",qid, " redirects to ", qid_r,
                     ", so all information returned is about ", qid_r, "\n"),file=stderr())
        if (debug!=FALSE && qid_r %in% entity_list)
          cat(paste0(" INFO: also note that ",qid_r,
                     " also is in the original entity list\n"), file=stderr())
      }
      #
      # Get the label and the description of the entity. Retrieve the first
      # available in language order, or any other one if it exists, otherwise NA
      for (ld in c('label', 'description')) {
        item <- paste0(ld, 's')
        if (is.null(data[[item]]) || length(data[[item]])==0) {
          qiddict$set(paste0(ld, "lang"), NA)
          qiddict$set(ld, NA)
          next
        }
        lds <- data[[item]]
        existslang <- FALSE
        for (lang in llangs) {
          if (!is.null(lds[[lang]]) && is.null(lds[[lang]][['for-language']])) {
            existslang <- TRUE
            qiddict$set(paste0(ld,'lang'), lds[[lang]]$language)
            qiddict$set(ld, lds[[lang]]$value)
            break
          }
        }
        # If no label/description in any lang of langsorder, take the first one
        if (!existslang) {
          qiddict$set(paste0(ld,'lang'), lds[[1]]$language)
          qiddict$set(ld, lds[[1]]$value)
        }
      } # close: # Get the label and the description of the entity.

      # Processing claims
      claims <- data$claims
      #
      for (fname in names(fields)) {
        f <- fields[[fname]]
        values <- character()
        nrefs  <- integer()
        is_entity <- FALSE    # If the claim stores a Qxxx, add it to the corresponding "xxxQ" field
        for (item in claims[[f]]) {
          if (is.null(item$mainsnak$datavalue))  # The claim exists, but its value is unknown
            next
          valuetype <- item$mainsnak$datavalue$type
          value     <- item$mainsnak$datavalue$value
          # count references
          if (!is.null(item$references))
            nrefs <- append(nrefs, length(item$references) )
          else
            nrefs <- append(nrefs, 0)
          #
          if (valuetype == 'string')
            v <- value
          else if (valuetype == 'wikibase-entityid') {
            is_entity <- TRUE
            v <- value$id
            # Store qids to search labels later (not for places, because labels
            # are returned when retrieving places)
            if (!(f %in% c('P19', 'P20')))
              qidsoflabels <- append(qidsoflabels, v)
          }
          else if (valuetype == 'time')
            v <- value$time
          else if (valuetype == 'monolingualtext')
            v <- paste0(value$text, ':', value$language)
          else if (valuetype == 'quantity') {
            unit <- value$unit
            unit <- sub('http://www.wikidata.org/entity/(Q.*)', "\\1", unit)
            qidsoflabels <- append(qidsoflabels, unit)
            v <- paste0(value$amount, ' : ', unit)
          }
          else
            cat(paste0("WARNING w_EntityInfo: valuetype ", valuetype, " not implemented"), file=stderr())
          #
          # Check reviewers for P444 (review score)
          if (f=='P444' && !is.null(item$qualifiers) && !is.null(item$qualifiers$P447)) {
            reviewerQ <- item$qualifiers$P447[[1]]$datavalue$value$id
            qidsoflabels <- append(qidsoflabels, reviewerQ)
            v <- paste0(v, " [", reviewerQ, "]")
          }
          # Rank=preferred takes preference
          if (!is.null(item$rank) && item$rank=='preferred') {
            values <- c(v)
            nrefs  <- c(0)
            break
          }
          else
            values <- append(values, v)
        }

        if (length(values)!=0) { # The claim has at least one non-erroneous value
          if (!(f %in% fieldsonlyone))
            qiddict$set(fname, paste0(unique(values), collapse = '|'))
          else {
            # Get the most referenced value
            index_max <- which.max(nrefs)
            v <- values[index_max]
            qiddict$set(fname, v)
          }
          # Store qids to search places later
          if (f %in% c('P19', 'P20'))
            qidsofplaces <- append(qidsofplaces, v)
          # Extract year from bdate/ddate/pubdate:
          if (f %in% c('P569', 'P570', 'P577') && (f %in% fields)) {
            ff <- sub("date", "year", fname)
            qiddict$set(ff, sub("^.(.{4}).+$","\\1", v))
          }
          # if is_entity add qids to the corresponding "Q" field
          if (is_entity)
            qiddict$set(paste0(fname,'Q'), qiddict$get(fname))
        }
      }

      # Wikipedias
      sitelinks <- data$sitelinks
      if (length(sitelinks)>0) {
        wvalues <- character()
        if (length(wlangs)==0) {
          for (wlwiki in names(sitelinks)) {
            if (endsWith(wlwiki, 'wiki')) {
              wl <- sub("^(.+).{4}$", "\\1", wlwiki)
              title <- gsub(" ", "_", sitelinks[[wlwiki]]$title)
              title <- URLencode(title)
              wvalues <- append(wvalues, paste0('https://',wl,'.wikipedia.org/wiki/',title))
            }
          }
        }
        else {
          for (wl in wlangs) {
            wlwiki <- paste0(wl, 'wiki')
            if (wlwiki %in% names(sitelinks)) {
              title <- gsub(" ", "_", sitelinks[[wlwiki]]$title)
              title <- URLencode(title)
              wvalues <- append(wvalues, paste0('https://',wl,'.wikipedia.org/wiki/',title))
            }
          }
        }
        if (length(wvalues) > 0)
          qiddict$set('wikipedias', paste0(wvalues, collapse = '|'))
      }
      # Add URL parts to the pic to download, if any:
      for (fname in c('poster', 'pic', 'video')) {
        if (qiddict$has(fname) && !is.na(qiddict$get(fname))) {
          pic <- qiddict$get(fname)
          pics <- strsplit(pic, '|', fixed = T)[[1]]
          pics <- vapply(gsub(" ", "_", pics), URLencode, character(1), USE.NAMES = FALSE)
          pics <- paste('https://commons.wikimedia.org/wiki/Special:FilePath/',pics, sep='')
          qiddict$set(fname, paste0(pics, collapse = '|'))
        }
      }
      #
      # Adding qiddict to d
      d <- append(d, qiddict)
      #
    } ## close: for (qid in names(entities))
    #
  } ## close: for (k in 0:nlim)
  #
  names(d) <- entity_list
  #
  # Only for mode='default': Search places for labels, coords (Latitude and Longitude) and countries
  placesdict <- collections::dict()
  if (length(qidsofplaces) > 0) {
    qidsofplaces <- unique(qidsofplaces)
    if (debug!=FALSE)
      cat(paste0("INFO: Searching labels, latitude and longitude coordinates, and countries for places.\n"), file=stderr())
    places <- w_Geoloc(qidsofplaces, langsorder=langsorder, debug=debug)
    # Set colnames in right form
    colnames(places) <- c('placeQ', 'place', 'placeLat', 'placeLon', 'countryQ', 'country')
    # create a dict of dicts for fast allocating
    for (qplace in rownames(places)) {
      placesdict$set(qplace, collections::dict(items=places[qplace, ], keys=colnames(places)))
    }
    # Add labels, geo-coordinates and country to entities
    for (qid in names(d)) {
      # data <- d[[qid]]
      for (bd in c('b','d')){
        fname <- paste0(bd,'placeQ')  # bplaceQ / dplaceQ
        if(!is.na(d[[qid]]$get(fname)) && placesdict$has(d[[qid]]$get(fname))){
          pdict <- placesdict$get(d[[qid]]$get(fname))    # get the Qxxx of the bplaceQ or dplaceQ
          d[[qid]]$update(collections::dict(items=pdict$values(),
                                            keys=paste0(bd, pdict$keys())))
        }
      }
    }
  }
  # Search labels for the rest of Qxxx entities
  labelsdict <- collections::dict()
  if (length(qidsoflabels) > 0) {
    qidsoflabels <- unique(qidsoflabels)
    if (debug!=FALSE)
      cat(paste0("INFO: Searching labels for Wikidata entities.\n"), file=stderr())
    labels <- w_LabelDesc(qidsoflabels, what='L', langsorder=langsorder, debug=debug)
    if(!is.null(labels)){
      # create a dict of labels for fast allocating
      labelsdict <- collections::dict(items=labels$label, keys=rownames(labels))
      # Add labels to entities
      for (qid in names(d)) {
        data <- d[[qid]]
        for (fname in data$keys()) {
          if (is.na(data$get(fname)))
            next
          if (endsWith(fname,'Q') && !(fname %in% c('bplaceQ', 'dplaceQ', 'bcountryQ', 'dcountryQ'))) {
            # Get the correct field (without "Q")
            fnn <- sub("^(.+)Q$","\\1", fname)
            # Replace all occurrences of Qxxx in those fields
            qx <- strsplit(data$get(fnn), '|', fixed = T)[[1]]
            labelsx <- lapply(qx, function(x) {labelsdict$get(x)})
            d[[qid]]$set(fnn, paste0(labelsx, collapse='|'))
          }
          if (fname == 'duration' && !is.na(data$get('duration'))) {     # data['duration'] = '+125 : Q7727'
            duration <- data$get('duration')
            if (grepl('Q\\d+', duration)) {
              durationq <- regmatches(duration, regexec('Q\\d+', duration))[[1]]
              data$set('duration', sub(': Q\\d+', labelsdict$get(durationq), duration))
            }
          }
          if (fname == 'reviewscore' && !is.na(data$get('reviewscore'))) {  # data['reviewscore'] = '99/00 [Q1234]'
            rsl <- character()
            for (rs in strsplit(data$get('reviewscore'), '|', fixed = T)[[1]]) {
              if (grepl('Q\\d+', rs)) {
                rsq <- regmatches(rs, regexec('Q\\d+', rs))[[1]]
                rsl <- append(rsl, sub('Q\\d+', labelsdict$get(rsq), rs))
              }
            }
            data$set('reviewscore', paste0(rsl, collapse = '|'))
          }
        }
      }
    }
  }
  # Finally, convert the dict of dicts to a dataframe
  df <- as.data.frame(do.call(rbind,lapply(d, function(x) unlist(x$as_list()))))

  if(mode=="tiny"){
    tinycolumns <- c('entity', 'labellang', 'label', 'descriptionlang', 'description', 'sex',
        'bdate', 'byear', 'bplace', 'bplaceLat', 'bplaceLon', 'bcountry',
        'ddate', 'dyear', 'dplace', 'dplaceLat', 'dplaceLon', 'dcountry',
        'occupationQ', 'occupation', 'viafid', 'bneid', 'mncarsid', 'pic', 'wikipedias')
    df <- df[,intersect(tinycolumns,colnames(df))]
  }

  return(df)
}


# MediaWiki API ----
# See https://www.mediawiki.org/wiki/API:Main_page
# See https://en.wikipedia.org/w/api.php  (xtools.wmcloud.org)
# -----------------------------------------------------------.

#' Uses httr package to retrieve responses using the MediaWiki API.
#'
#' For MediaWiki requests only user_agent is necessary in the request headers.
#' See https://www.mediawiki.org/wiki/API:Etiquette. The standard and default
#' output format in MediaWiki is JSON. All other formats are discouraged. The
#' output format should always be specified using the request param "format"
#' in the "query" request.
#' See https://www.mediawiki.org/wiki/API:Data_formats#Output.
#' @param query A list with the (key, value) pairs of the search.
#' Note that if titles are included in the query, the MediaWiki API has a
#' limit of 50 titles per query. If the number of titles exceeds this
#' limit an error is raised.
#' @param project The Wikimedia project to search. Default en.wikipedia.org.
#' @param method The method used in the httr request. Default 'GET'.
#' Note in "https://www.mediawiki.org/wiki/API:Etiquette#Request_limit":
#' "Whenever you're reading data from the web service API, you should try to use
#' GET requests if possible, not POST, as the latter are not cacheable."
#' @param attempts On ratelimit errors, the number of times the request is
#' retried, waiting 60 seconds times the attempt number between retries.
#' Default 2. If 0, no retries are done.
#' @param debug For debugging purposes (default FALSE). If debug='info'
#' information about chunked queries is shown. If debug='query' also the query
#' launched is shown.
#' @return The response in JSON format, raise exception on errors.
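#' @examples
#' \dontrun{
#' ## Minimal sketch: a 'wbgetentities' query against www.wikidata.org, as used
#' ## internally by w_EntityInfo (keys and values are illustrative)
#' q <- list(format = 'json', action = 'wbgetentities',
#'           props = 'labels|descriptions', ids = 'Q42')
#' j <- reqMediaWiki(q, project = 'www.wikidata.org')
#' }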
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @importFrom httr GET POST content add_headers stop_for_status
#' @importFrom jsonlite fromJSON
#' @importFrom utils URLdecode
reqMediaWiki <- function(query, project='en.wikipedia.org', method='GET',
                         attempts=2, debug=FALSE) {
  # Check for titles in query
  if ('titles' %in% names(query)) {
    titles <- strsplit(query$titles, '|', fixed = T)[[1]]
    n <- length(titles)
    if (n > MW_LIMIT)
      stop(paste0("ERROR: number of titles (", n,") exceeds Wikipedia API limit (",MW_LIMIT,")\n"))
  }
  # Check for ids in query
  if ('ids' %in% names(query)) {
    ids <- strsplit(query$ids, '|', fixed = T)[[1]]
    n <- length(ids)
    if (n > MW_LIMIT)
      stop(paste0("ERROR: number of ids (", n,") exceeds Wikipedia API limit (",MW_LIMIT,")\n"))
  }
  #
  url = paste0('https://', project, "/w/api.php")
  if(!curl::has_internet() || httr::http_error(url,query = query)){
    message("No internet connection or data source broken.")
    return(NULL)
  }
  nt <- 0
  tryCatch( {
    repeat {
      nt <- nt + 1
      if (method=='GET')
        r <- httr::GET(
          url = url,
          query = query,
          httr::user_agent(user_agent)
        )
      else if (method=='POST')
        r <- httr::POST(
          url = url,
          body = query,
          httr::user_agent(user_agent),
          encode = 'form'
        )
      else stop(paste0("ERROR: method '", method, "' is not supported."))
      #
      if (debug == "query")
        cat(paste0(URLdecode(r$request$url), "\n"), file=stderr())
      #
      # Converts http errors to R errors or warnings
      httr::stop_for_status(r)
      content <- httr::content(r, as = "text", encoding = "UTF-8")
      j <- jsonlite::fromJSON(content, simplifyVector = FALSE)
      #
      # See https://www.mediawiki.org/wiki/API:Etiquette#Request_limit
      if (!is.null(j$error)) {
        if (j$error$code == 'ratelimited') {
          if (nt > attempts)
            stop(paste0(as.character(nt)," ratelimited attempts achieved, aborting.\n"))
          else {
            t <- 60*nt
            cat(paste0("INFO: ", nt," ratelimited attempt(s) error. Sleeping", as.character(t), "seconds.\n"),
                file = stderr())
            Sys.sleep(t)
          }
        }
        else {
          stop(paste0("ERROR: reqMediaWiki(): ", j$error$code, ": ", j$error$info))
        }
      }
      else {
        return(j)
      }
    }
  }, error = function(e){
    cat(as.character(e), file=stderr())
    return(NULL)
  } ) # End tryCatch
}

#' Return the normalized and redirect title from the response
#'
#' Return the normalized and the redirect title (also normalized), if any, from
#' the query part of the JSON response of a MediaWiki search. The response of
#' the MediaWiki API query (https://www.mediawiki.org/wiki/API:Query) includes
#' original page titles and possibly normalized and redirected titles, if the
#' API needs to obtain them. For an original title, this function returns them,
#' if any.
#' @param title The title likely to be found in q.
#' @param q The query part of the JSON response (j\['query'\]) from a Mediawiki
#' search. Note that this part contains some titles, so it is necessary to
#' search the original "title" in that part.
#' @return A vector with the normalized or redirected page title (target, also
#' normalized) found for the title.
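#' @examples
#' \dontrun{
#' ## Minimal sketch with a hypothetical 'query' fragment of a MediaWiki
#' ## response (real responses come from reqMediaWiki)
#' q <- list(normalized = list(list(fromencoded = FALSE, from = "max planck",
#'                                  to = "Max planck")),
#'           redirects  = list(list(from = "Max planck", to = "Max Planck")))
#' normalizedTitle("max planck", q)  # returns "Max Planck"
#' }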
normalizedTitle <- function(title, q) {
  # if ( !(jsonlite::validate(jsonlite::toJSON(q))) ) ## Very slow!!!
  #    stop("ERROR: j is not a valid JSON structure.")
  #
  anorm <- title
  # Is normalized (and possibly encoded)? # See https://phabricator.wikimedia.org/T29849#2594624
  # It is supposed that order is: first, NFC normalization, then, uppercase normalization.
  if (!is.null(q$normalized)) {
    for (nn in q$normalized) {
      if (nn$fromencoded && URLencode(anorm) == nn$from)  # NFC normalization
        anorm <- nn$to
      if (nn$from == anorm)  { # Uppercase normalization
        anorm <- nn$to
        break
      }
    }
  }
  # Is it a redirect? The normalized title is used.
  if (!is.null(q$redirects)) {
    for (nn in q$redirects)
      if (nn$from == anorm) {
        anorm <- nn$to
        break
      }
  }
  return(anorm)
}

#' checkTitles(titles)
#' Check if titles are valid. Return TRUE if all titles are valid, else FALSE.
#' See https://en.wikipedia.org/wiki/Wikipedia:Page_name#Technical_restrictions_and_limitations
#' @param titles A vector of titles to check.
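#' @examples
#' \dontrun{
#' ## Sketch: '|' is not allowed in page titles, so this returns FALSE
#' checkTitles(c("Max Planck", "Bad|title"))
#' }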
checkTitles <- function(titles){
  if (length(titles) == 0)
    return(FALSE)
  #
  for (title in titles) { # Escaped REGEX: #<>[]|{} ---> #<>\\[\\]\\\{\\}
    if (grepl('[#<>\\[\\]|\\{\\}]', title)) {
      cat(paste0('ERROR: the title ', title, ' contains a character that is not allowed\n'), file=stderr())
      return(FALSE)
    }
  }
  return(TRUE)
}

#' Open search of a string
#'
#' Search a string in the content of the project pages using OpenSearch. Only in
#' namespace 0. Please, see https://www.mediawiki.org/wiki/API:Opensearch for
#' further information.
#' @param string String to search.
#' @param project Wikimedia project, defaults "en.wikipedio.org".
#' @param profile This parameter sets the search type: classic,
#' engine_autoselect (default), fast-fuzzy, fuzzy, fuzzy-subphrases, normal,
#' normal-subphrases, and strict.
#' @param redirects If redirects='return', the page title returned is the
#' normalized one (also the URL). If redirects='resolve', the page title
#' returned is the normalized one with the redirection resolved (also the URL).
#' Note that in both cases the API performs a NFC Unicode normalization of the
#' search string.
#' @return A data-frame of page titles and URLs returned. If error, return NULL.
#' @note Only for namespace 0. The function also obtains redirections for
#' disambiguation pages.
#' @examples
#' # Some search profiles:
#' df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org',
#'                     profile="engine_autoselect", redirects="resolve")
#' df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="strict")
#' df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="fuzzy")
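#' # Sketch: keep the normalized title without resolving redirections
#' df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', redirects="return")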
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
m_Opensearch <- function(string, project='en.wikipedia.org',
                         profile="engine_autoselect", redirects="resolve") {
  #
  query = list(format        = 'json',
               formatversion = '2',
               action        = 'opensearch',
               namespace     = '0',
               limit         = 'max',  # "max" is 500, but in practice it is 100
               profile       = profile,
               redirects     = redirects,
               search        = string)
  #
  j <- reqMediaWiki(query, project)
  #
  if (!is.null(j$error)) {
    cat(paste0('ERROR: ', j$error$code,': ', j$error$info,'\n'),
        file = stderr())
    return(NULL)
  }
  # NFC unicode normalization is carried out on string
  # if (j[[1]][1] != stringi::stri_trans_nfc(string)) {
  #   cat('ERROR: return string is different to the search string.\n',
  #       file = stderr())
  #   return(NULL)
  # }
  #
  output <- data.frame(title = unlist(j[[2]]), urls = unlist(j[[4]]))
  return(output)
}

#' Retrieve responses using the MediaWiki API.
#'
#' Use the MediaWiki API to check Wikipedia pages titles, get redirections of
#' Wikipedia pages, get image URL of Wikipedia pages or get URL of files in
#' Wikipedia pages
#' @param titles A vector of page titles to search for.
#' @param mode Select an action to perform:
#' 'wikidataEntity' ->
#' Uses reqMediaWiki to check if page titles are in a Wikimedia project and
#' returns the Wikidata entity for them. Automatically resolves redirects if the
#' parameter redirects = TRUE (default). If a page title exists in the Wikimedia
#' project, the status column in the returned data-frame is set to 'OK'. If a
#' page is a disambiguation page, that column is set to 'disambiguation', and if
#' a title is not in the Wikimedia project, it is set to 'missing' and no
#' Wikidata entity is returned;
#' 'redirects' ->
#' Obtains the redirections of the pages with the article titles in the
#' Wikimedia project, restricted to namespace 0. Returns a vector for each
#' title; in each vector the first element is the destination page, the rest
#' are all pages that redirect to it. If a title is not in the Wikimedia
#' project its list is NA;
#' 'pagePrimaryImage' ->
#' Returns the URL of the image associated with the Wikipedia pages of the
#' titles, if the pages have one. Automatically resolves redirects; the
#' "normalized" column of the returned data-frame contains the destination page
#' of the redirection.
#' See https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bpageimages;
#' 'pageFiles' ->
#' Searches for the URLs of files inserted in Wikipedia pages, excluding the
#' extensions in exclude_ext. Note that the query API names this search
#' 'images', but all source files in the page are returned. The function only
#' returns URLs that do not end with the extensions in the exclude_ext
#' parameter (case insensitive). Automatically resolves redirects; the
#' "normalized" column of the returned data-frame contains the destination page
#' of the redirection.
#' See https://en.wikipedia.org/w/api.php?action=help&modules=query%2Bimages
#' @param project Wikimedia project, defaults "en.wikipedia.org"
#' @param redirects If page redirects must be resolved. If redirects=TRUE
#' (default) then the "normalized" column of the returned data-frames contains
#' the destiny page title of the redirection. Only for mode=wikidataEntity.
#' @param exclude_ext File extensions excluded in results.
#' Only for mode=PageFiles. Default 'svg|webp|xcf'
#' @return depends on the mode selected:
#' 'wikidataEntity' NULL if there is any error in response, else a data-frame
#' with four columns: first, the original page title string, second, the
#' normalized one, third, the status of the page ('OK', 'disambiguation',
#' 'missing', etc.), and last, the Wikidata entity itself, or NA if the page
#' has no associated entity;
#' 'redirects' A vector for each title, with all pages that are redirects to the
#' first element;
#' 'pagePrimaryImage' A data-frame with original titles, normalized ones, the
#' status of the pages and the primary image of the page, or NA if it does not
#' exist;
#' 'pageFiles' A data-frame with original titles, the normalized ones, status
#' for the page and the URL files of the Wikipedia pages (using "|" to
#' separate them), or NA if files do not exist or are excluded.
#' @examples
#' # Note that URLdecode("a%CC%8C") is
#' # the letter "a" with the combining caron
#' df <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
#'                     mode='wikidataEntity', project='en.wikipedia.org')
#' a <- m_reqMediaWiki(c('Cervantes', 'Planck', 'Noexiste'), mode='redirects',
#'                     project='es.wikipedia.org')
#' i <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
#'                     mode='pagePrimaryImage')
#' f <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
#'                     mode='pageFiles', exclude_ext = "svg|webp|xcf")
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @export
m_reqMediaWiki <- function(titles,
                           mode=c('wikidataEntity','redirects','pagePrimaryImage','pageFiles'),
                           project='en.wikipedia.org', redirects=TRUE, exclude_ext='svg|webp|xcf'){
  mode <- mode[1]
  if(mode=='wikidataEntity'){
    return(m_WikidataEntity(titles,project,redirects))
  }else if(mode=='redirects'){
    return(m_Redirects(titles,project))
  }else if(mode=='pagePrimaryImage'){
    return(m_PagePrimaryImage(titles,project))
  }else if(mode=='pageFiles'){
    return(m_PageFiles(titles,project,exclude_ext))
  }else{
    stop("mode: wrong mode specified")
  }
}

# Check Wikipedia pages titles
m_WikidataEntity <- function(titles, project='en.wikipedia.org',
                             redirects=TRUE) {
  if (!checkTitles(titles))
    return(NULL)
  #
  # Check the limit on number of titles of Wikidata API (50 titles)
  n <- length(titles)
  nlimit <- 50
  output <- NULL
  if (n > nlimit) {
    cat(paste0("INFO: The number of titles (", n,") exceeds Wikipedia API limit (",
               nlimit,"): doing chunked queries.\n"),
        file = stderr())
    nlim <- as.integer(n/nlimit)
    for (k in 0:nlim) {
      offset <- nlimit * k
      if ((offset+1) > n)
        break
      t_list <- titles[(offset+1):min(n,offset+nlimit)]
      cat(paste0(" INFO: Getting information of entities from ", offset+1," to ", offset+length(t_list), '.\n'),
          file = stderr())
      aux <- m_WikidataEntity(t_list, project, redirects)
      #
      if (k==0)
        output <- aux
      else
        output <- rbind(output, aux)
    }
    return(output)
  }
  #
  query <- list(format        = 'json',
                formatversion = '2',
                action        = 'query',
                # redirects     = '1',
                prop          = 'pageprops',
                ppprop        = 'wikibase_item|disambiguation',
                titles        = paste0(titles, collapse = '|'))
  if (redirects)
    query["redirects"] = 1
  #
  # vectors for loading the return information
  normalized <- character()
  status     <- character()
  entity     <- character()

  repeat {
    j <- reqMediaWiki(query, project)
    #
    if (is.null(j) || is.null(j$query)){
      cat("Error in m_WikidataEntity: reqMediaWiki returns an improper JSON.",
          file = stderr())
      return(NULL)
    }
    #
    q <- j$query
    #
    for (a in titles) {
      anorm <- normalizedTitle(a, q)
      # Check if a different title with an identical normalized title has
      # already been processed. It's necessary for stability in the number of elements.
      index <- match(anorm, normalized)
      if (!is.na(index)) {
        cat(paste0("   INFO: '",anorm,"' was already in the list.\n"), file = stderr())
        normalized <- append(normalized, anorm)
        status     <- append(status, status[index])
        entity     <- append(entity, entity[index])
        next
      }
      #
      for (i in 1:length(q$pages)) { # With formatversion=2, j$json$query$pages is a vector
        page <- q$pages[[i]]
        item <- NA
        if (anorm == page$title) {
          aerror <- TRUE
          if (!is.null(page$invalid))   # malformed title?
            s <- 'invalid'
          else if (!is.null(page$missing))
            s <- 'missing'
          else if (is.null(page$pageprops))
            s <- 'no_pageprops'
          else {
            if (is.null(page$pageprops$wikibase_item))
              s <- "no_wikibase_item"
            else
              item <- page$pageprops$wikibase_item
            s <- ifelse(!is.null(page$pageprops$disambiguation), 'disambiguation', 'OK')
          }
          # Feed the output vectors
          normalized <- append(normalized, anorm)
          status     <- append(status, s)
          entity     <- append(entity, item)
          q$pages[[i]] <- NULL
          break
        }
      }
    }
    if (!is.null(j$continue)) {
      cat(paste("  INFO: continue response\n"), file = stderr())
      for (m in names(j$continue))
        query[[m]] <- j$continue[[m]]
    }
    else {
      output <- data.frame(original=titles, normalized=normalized,
                           status=status, entity=entity)
      return(output)
    }
  }
}

# Get redirections of Wikipedia pages
m_Redirects <- function(titles, project="en.wikipedia.org") {
  if (!checkTitles(titles))
    return(NULL)
  #
  # Check the limit on number of titles of Wikidata API (50 titles)
  n <- length(titles)
  nlimit = 50
  if (n > nlimit) {
    cat(paste0("INFO: The number of titles (", n,") exceeds Wikipedia API limit (",
               nlimit,"): doing chunked queries.\n"),
        file = stderr())
    nlim <- as.integer(n/nlimit)
    for (k in 0:nlim) {
      offset <- nlimit * k
      if ((offset+1) > n)
        break
      t_list <- titles[(offset+1):min(n,offset+nlimit)]
      cat(paste0(" INFO: Getting redirects of entities from ", offset+1," to ", offset+length(t_list), '.\n'),
          file = stderr())
      aux <- m_Redirects(t_list, project)
      #
      if (k==0)
        output <- aux
      else
        output <- append(output, aux)
    }
    return(output)
  }
  #
  query = list(format        = 'json',
               formatversion = '2',
               action        = 'query',
               redirects     = '1',
               prop          = 'redirects',
               rdnamespace   = '0',             # Only from namespace = 0
               rdlimit       = 'max',           # max: Change to 10 for testing
               rdprop        = 'title',
               titles        = paste0(titles, collapse = '|'))
  # Get a list with list names but without list entries.
  output <- sapply(titles, function(x) NULL)
  #
  repeat {
    j <- reqMediaWiki(query, project)
    #
    if (is.null(j) || is.null(j$query)){
      cat("Error in m_Redirects: reqMediaWiki returns an improper JSON.",
          file = stderr())
      return(NULL)
    }
    #
    r <- j$query
    #
    for (a in titles) {
      anorm <- normalizedTitle(a, r)
      #  query is the original search query, before possible continue responses.
      # Any continue response includes the same "normalized" and "redirects" info
      if (is.null(query$continue))
        output[[a]] <- append(output[[a]], anorm)

      # With formatversion=2, j$json$query$redirects is a vector
      for (page in r$pages) {
        if (anorm == page$title) {
          if (!is.null(page$missing))   # missing, NULL
            output[[a]] <- NA
          if (!is.null(page$redirects))
            for (redirect in page$redirects)
              output[[a]] <- append(output[[a]], redirect$title)
        }
      }
    }
    if (!is.null(j$continue)) {
      cat(paste("  INFO: continue response\n"), file = stderr())
      for (m in names(j$continue))
        query[[m]] <- j$continue[[m]]
    }
    else
      return(output)
  }
}

# Get image URL of Wikipedia pages
m_PagePrimaryImage <- function(titles, project="en.wikipedia.org") {
  if (!checkTitles(titles))
    return(NULL)
  #
  # Check the limit on number of titles of Wikidata API (50 titles)
  n <- length(titles)
  nlimit = 50
  output <- NULL
  if (n > nlimit) {
    cat(paste0("INFO: The number of titles (", n,") exceeds Wikipedia API limit (",
               nlimit,"): doing chunked queries.\n"),
        file = stderr())
    nlim <- as.integer(n/nlimit)
    for (k in 0:nlim) {
      offset <- nlimit * k
      if ((offset+1) > n)
        break
      t_list <- titles[(offset+1):min(n,offset+nlimit)]
      cat(paste0(" INFO: Getting primary images of entities from ", offset+1," to ", offset+length(t_list), '.\n'),
          file = stderr())
      aux <- m_PagePrimaryImage(t_list, project)
      #
      if (k==0)
        output <- aux
      else
        output <- rbind(output, aux)
    }
    return(output)
  }
  #
  query = list(format        = 'json',
               formatversion = '2',
               action        = 'query',
               redirects     = '1',
               prop          = 'pageimages', # return URL and dimensions
               piprop        = 'original',
               pilimit       = 'max',        # max(=50): Change to 10 for testing
               titles        = paste0(titles, collapse = '|'))
  # vectors for loading the returned information
  normalized <- character()
  status     <- character()
  pageimage  <- character()
  #
  repeat {
    j <- reqMediaWiki(query, project)
    #
    if (is.null(j) || is.null(j$query)){
      cat("Error in m_PagePrimaryImage: reqMediaWiki returns an improper JSON.",
          file = stderr())
      return(NULL)
    }
    #
    q <- j$query
    #
    for (a in titles) {
      anorm <- normalizedTitle(a, q)
      # Check if a normalized title has already been loaded.
      # On a continue response, do not overwrite the previous result.
      index <- match(anorm, normalized)
      if (!is.na(index)) {
        normalized <- append(normalized, anorm)
        status     <- append(status, status[index])
        pageimage  <- append(pageimage, pageimage[index])
        next
      }
      #
      # With formatversion=2, j$json$query$pages is a vector
      #for (page in q$pages) {
      for (i in 1:length(q$pages)) {
        page <- q$pages[[i]]
        if (anorm == page$title) {
          image <- NA
          if (!is.null(page$invalid))
            s <- 'invalid'
          else if (!is.null(page$missing))
            s <- 'missing'
          else if (is.null(page$pageid))
            s <- 'no pageid'
          else
            s <- 'OK'
          #
          if (!is.null(page$original) && !is.null(page$original$source))
            image <- page$original$source
          # Feed the output variables
          normalized <- append(normalized, anorm)
          status     <- append(status, s)
          pageimage  <- append(pageimage, image)
          q$pages[[i]] <- NULL
          break
        }
      }
    }
    #
    if (!is.null(j$continue)) {
      cat(paste("  INFO: continue response\n"), file = stderr())
      for (m in names(j$continue))
        query[[m]] <- j$continue[[m]]
    }
    else {
      output <- data.frame(original=titles, normalized=normalized,
                           status=status, pageimage=pageimage)
      return(output)
    }
  }
}

# Get URL of files in Wikipedia pages
m_PageFiles <- function(titles, project = "en.wikipedia.org",
                        exclude_ext = 'svg|webp|xcf') {
  if (!checkTitles(titles))
    return(NULL)
  #
  exts <- strsplit(tolower(exclude_ext), "|", fixed=T)[[1]]
  #
  # Check the limit on number of titles of Wikidata API (50 titles)
  n <- length(titles)
  nlimit = 50
  output <- NULL
  if (n > nlimit) {
    cat(paste0("INFO: The number of titles (", n,") exceeds Wikipedia API limit (",
               nlimit,"): doing chunked queries.\n"),
        file = stderr())
    nlim <- as.integer(n/nlimit)
    for (k in 0:nlim) {
      offset <- nlimit * k
      if ((offset+1) > n)
        break
      t_list <- titles[(offset+1):min(n,offset+nlimit)]
      cat(paste0(" INFO: Getting inserted files of entities from ", offset+1," to ", offset+length(t_list), '.\n'),
          file = stderr())
      aux <- m_PageFiles(t_list, project, exclude_ext)
      #
      if (k==0)
        output <- aux
      else
        output <- rbind(output, aux)
    }
    return(output)
  }
  #
  query = list(format        = 'json',
               formatversion = '2',
               action        = 'query',
               redirects     = '1',
               prop          = 'images',
               imlimit       = 'max',           # max(=50): Change to 10 for testing
               titles        = paste0(titles, collapse = '|'))
  # vectors for loading the return information
  normalized <- character()
  status     <- character()
  pageimages <- character()
  #
  repeat {
    j <- reqMediaWiki(query, project)
    #
    if (is.null(j) || is.null(j$query)){
      cat("Error in m_PageFiles: reqMediaWiki returns an improper JSON.",
          file = stderr())
      return(NULL)
    }
    #
    q <- j$query
    #
    for (a in titles) {
      anorm <- normalizedTitle(a, q)
      # Check if a normalized title has already been loaded.
      # On a continue response, do not overwrite the previous result.
      index <- match(anorm, normalized)
      if (!is.na(index)) {
        normalized <- append(normalized, anorm)
        status     <- append(status, status[index])
        pageimages <- append(pageimages, pageimages[index])
        next
      }
      #
      # With formatversion=2, j$json$query$pages is a vector
      #for (page in q$pages) {
      for (i in 1:length(q$pages)) {
        page <- q$pages[[i]]
        images <- NA
        if (anorm == page$title) {
          if (!is.null(page$invalid))
            s <- 'invalid'
          else if (!is.null(page$missing))
            s <- 'missing'
          else if (is.null(page$pageid))
            s <- 'no pageid'
          else
            s <- 'OK'
          #
          if (!is.null(page$images)) {
            images <- character()
            for (im in page$images) {
              ext <- strsplit(im$title, ".", fixed = T)[[1]]
              ext <- tolower(ext[length(ext)])    # last is the extension
              if (ext %in% exts)
                next
              img <- paste0('https://', project, '/wiki/', im$title)
              img <- gsub(" ", "_", img)
              # concatenate to previous image
              images <- append(images, img)
            }
            if (length(images) == 0)  # some images may be in exclude_ext
              images <- NA
            else
              images <- paste0(images, collapse = '|')
          }
          # Feed the output variables
          normalized <- append(normalized, anorm)
          status     <- append(status, s)
          pageimages <- append(pageimages, images)
          q$pages[[i]] <- NULL
          break
        }
      }
    }
    #
    if (!is.null(j$continue)) {
      cat(paste("  INFO: continue response\n"), file = stderr())
      for (m in names(j$continue))
        query[[m]] <- j$continue[[m]]
    }
    else {
      output <- data.frame(original=titles, normalized=normalized,
                           status=status, pageimages=pageimages)
      return(output)
    }
  }
}


# WikiMedia API ----
# Used to obtain metrics of Wikipedia pages
# See https://www.mediawiki.org/wiki/Wikimedia_REST_API
# See https://en.wikipedia.org/api/rest_v1/
# See https://www.mediawiki.org/wiki/XTools/API/Page (xtools.wmcloud.org)
#
# Note that this API uses "article" to refer to a page title. We also use this
# approach. In this API only one article is allowed in each request.
# -------------------------------------------------------------------------.

#' httrGetJSON
#' Retrieve responses in JSON format using httr::GET. It is a generic function
#' for requesting these Wikimedia metrics APIs:
#'   https://wikimedia.org/api/rest_v1/
#'   https://www.mediawiki.org/wiki/XTools/API/Page (xtools.wmflabs.org)
#' @param url The URL with the query to the API.
#' @return A JSON response. Please check httr::stop_for_status(response)
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @importFrom httr GET user_agent add_headers
#' @note Used in m_Pageviews
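#' @examples
#' \dontrun{
#' ## Minimal sketch against the Wikimedia REST API; the URL follows the
#' ## per-article pageviews pattern also built in m_Pageviews
#' url <- paste("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article",
#'              "en.wikipedia.org", "all-access", "user", "Cervantes",
#'              "monthly", "20230101", "20230501", sep = "/")
#' r <- httrGetJSON(url)
#' httr::stop_for_status(r)
#' }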
httrGetJSON <- function(url) {
  # The URL must be encoded
  url <- URLencode(url)
  if(curl::has_internet() && !httr::http_error(url)){
    httr::GET(
      url = url,
      httr::user_agent(user_agent),
      httr::add_headers(Accept = "application/json")
    )
  }else{
    message("No internet connection or data source broken.")
  }
}

#' Get number of views of a Wikipedia article
#'
#' Use the Wikimedia REST API (https://wikimedia.org/api/rest_v1/) to get the
#' number of views an article has in a Wikimedia project over a date interval
#' (see granularity). If redirects=TRUE, the count also includes the views of
#' all articles that redirect to the target of the requested page.
#' @param article The title of the article to search. Only one article is allowed.
#' @param start,end First and last day to include (format YYYYMMDD or YYYYMMDDHH)
#' @param project The Wikimedia project; default is en.wikipedia.org.
#' @param access  Filter by access method: all-access (default), desktop, mobile-app, mobile-web
#' @param agent   Filter by agent type: all-agents, user (default), spider, automated
#' @param granularity  Time unit for the response data: daily, monthly (default)
#' @param redirects Boolean; include the views of all redirections of the page
#' (default: FALSE). If redirects=TRUE, the "normalized" element of the
#' returned list contains the target of the redirection, and the "original"
#' element contains the original title of the article.
#' If a page is itself the target of other pages and you want the total number
#' of views it receives (including the views of its redirections), you must
#' also set redirects=TRUE; otherwise you only get the views of that page.
#' @return A named list with the number of views per time unit of the requested
#' granularity, plus the 'original' title and, if redirects=TRUE, the
#' 'normalized' (target) title.
#' @examples
#' v <-  m_Pageviews(article="Cervantes", start="20230101", end="20230501",
#'                    project="es.wikipedia.org", granularity="monthly")
#' vv <- m_Pageviews(article="Cervantes", start="20230101", end="20230501",
#'                    project="es.wikipedia.org", granularity="monthly",
#'                    redirects=TRUE)
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @importFrom httr stop_for_status content
#' @importFrom jsonlite fromJSON
#' @export
m_Pageviews <- function(article, start, end, project="en.wikipedia.org",
                        access="all-access", agent="user",
                        granularity="monthly", redirects=FALSE) {
  # httrGetJSON_rated: the rate-limited version of httrGetJSON.
  # Limit is 100 req/s (See https://wikimedia.org/api/rest_v1/#/Pageviews%20data)
  httrGetJSON_rated <- limitRequester(httrGetJSON, n=100, period=1)
  article <- gsub(" ", "_", article)
  url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article"
  url <- paste(url, project, access, agent, article, granularity, start, end, sep="/", collapse = "")
  a <- list()
  a[['original']] <- article
  if (redirects) {
    arts <- m_Redirects(article, project)   # vector of lists
    arts <- arts[[article]]
    article <- arts[1]
    a[['normalized']] <- article
    for (art in arts) {
      if (length(arts) > 1)
        cat(paste0(" INFO: Getting the number of views of '", art,"' page.\n"), file=stderr())
      b <- m_Pageviews(art, start, end, project, access, agent,
                       granularity, redirects=FALSE)
      for (n in union(names(a), names(b))) {
        if (!(n %in% c('original', 'normalized')))
          a[n] <- ifelse(n %in% names(a), a[n][[1]], 0) + ifelse(n %in% names(b), b[n][[1]], 0)
      }
    }
  }
  else {
    r <- httrGetJSON_rated(url)
    if(!is.null(r)){
      httr::stop_for_status(r)
      content <- httr::content(r, as = "text", encoding = "UTF-8")
      j <- jsonlite::fromJSON(content, simplifyVector = FALSE)
      for (v in j$items)
        a[v$timestamp] <- v$views
    }
  }
  return(a)
}
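
# Usage note (kept as comments so nothing runs when the file is sourced):
# the list returned by m_Pageviews() is keyed by timestamp, plus the
# 'original' and, with redirects=TRUE, 'normalized' elements, so the total
# number of views can be obtained with something like:
#   v <- m_Pageviews(article="Cervantes", start="20230101", end="20230501",
#                    project="es.wikipedia.org", granularity="monthly")
#   sum(unlist(v[setdiff(names(v), c("original", "normalized"))]))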


#' Gets various information from a Wikimedia page
#'
#' Obtains information in JSON format about an article in the Wikimedia project,
#' or NULL on errors. Uses the XTools API: the XTools Page API endpoints offer
#' data related to a single page. See https://www.mediawiki.org/wiki/XTools/API/Page.
#' The URL of the API starts with 'https://xtools.wmcloud.org/api/page/'.
#' @param article The title of the article to search. Only one article is allowed.
#' @param project The Wikimedia project; default is en.wikipedia.org.
#' @param infotype The type of information to request: articleinfo, prose, links.
#' You can also use 'all' to retrieve all of them.
#' Note that the API also offers these options: top_editors, assessments,
#' bot_data and automated_edits.
#' @param redirects If redirects==TRUE, the information is obtained for the
#' target of the page's redirection. In that case, the "original" element of
#' the returned list contains the original page, and the "page" element the
#' target page. Also, if infotype=='links', the sum of the in-links of all
#' redirections is assigned to links_in_count.
#' @return A list with the information about the article.
#' @examples
#' \dontrun{
#' x <-  m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org")
#' xx <- m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org",
#'                    redirects=TRUE)
#'
#' y <-  m_XtoolsInfo(article="Miguel de Cervantes", infotype="links", project="es.wikipedia.org")
#' yy <- m_XtoolsInfo(article="Cervantes", infotype="links", project="es.wikipedia.org",
#'                     redirects=TRUE)
#'
#' z  <- m_XtoolsInfo(article="Miguel de Cervantes", infotype="all", project="es.wikipedia.org")
#' zz <- m_XtoolsInfo(article="Cervantes", infotype="all", project="es.wikipedia.org",
#'                        redirects=TRUE)
#' }
#' @author Angel Zazo, Department of Computer Science and Automatics, University of Salamanca
#' @importFrom httr stop_for_status content
#' @importFrom jsonlite fromJSON
#' @export
m_XtoolsInfo <- function(article, infotype=c("articleinfo", "prose", "links"),
                         project="en.wikipedia.org", redirects=FALSE) {
  infotype <- infotype[1]
  if(infotype=="all"){
    return(m_XtoolsInfoAll(article,project,redirects))
  }else{
    return(m_XtoolsInfoDefault(article,infotype,project,redirects))
  }
}

# Gets various information from a Wikimedia page
m_XtoolsInfoDefault <- function(article, infotype=c("articleinfo", "prose", "links"),
                                project="en.wikipedia.org", redirects=FALSE){
  if(!(infotype %in% c("articleinfo", "prose", "links"))){
    stop("infotype: wrong infotype specified")
  }
  # httrGetJSON_rated: the rate-limited version of httrGetJSON.
  # Limit is 100 req/s.
  httrGetJSON_rated <- limitRequester(httrGetJSON, n=100, period=1)
  originalarticle <- article
  if (redirects) {
    arts <- m_Redirects(article, project)   # vector of lists
    arts <- arts[[article]]
    article <- arts[1]
    cat(paste0(" INFO: Getting information of the '", article,"' page.\n"), file=stderr())
  }
  #
  url <- 'https://xtools.wmcloud.org/api/page'
  url <- paste(url, infotype, project, article, sep="/", collapse = "")
  r <- httrGetJSON_rated(url)
  if(is.null(r)){
    return(NULL)
  }
  httr::stop_for_status(r)
  content <- httr::content(r, as = "text", encoding = "UTF-8")
  d <- jsonlite::fromJSON(content, simplifyVector = FALSE)
  #
  if (infotype == "links" && redirects) {
    l <- length(arts)
    for (i in seq_len(l)[-1]) {   # skip arts[1], the redirection target itself
      cat(paste0("  INFO: Adding the number of links_in_count of the '", arts[i],"' page.\n"), file=stderr())
      b <- m_XtoolsInfo(arts[i], infotype="links", project=project,
                        redirects=FALSE)
      d["links_in_count"] <- d["links_in_count"][[1]] + b["links_in_count"][[1]]
      d["elapsed_time"] <- d["elapsed_time"][[1]] + b["elapsed_time"][[1]]
    }
  }
  if (redirects)
    d['original'] <- originalarticle
  return(d)
}

# Gets all the information (articleinfo, prose and links) from a Wikimedia page
m_XtoolsInfoAll <- function(article, project="en.wikipedia.org",
                            redirects=FALSE){
  # first: articleinfo
  r <- m_XtoolsInfo(article, infotype='articleinfo', project=project,
                    redirects=redirects)
  # Error in response
  if (!is.null(r$error)) {
    cat(paste0("Error in response from m_XtoolsInfo: ", r$error, '.\n'),
        file = stderr())
    return(NULL)
  }
  # Second: prose
  b <- m_XtoolsInfo(article, infotype="prose", project=project,
                    redirects=redirects)
  for (n in names(b))
    r[n] <- b[n]
  # Third: links (use a name other than `c` to avoid masking base::c)
  lnk <- m_XtoolsInfo(article, infotype="links", project=project,
                      redirects=redirects)
  for (n in names(lnk))
    r[n] <- lnk[n]
  #
  return(r)
}


# VIAF API ----
# https://developer.api.oclc.org/viaf-api#/Authority%20Cluster
# --------------------------------------------------------------------------.

#' Suggests VIAF id from a name
#'
#' Searches for the author's name using the VIAF AutoSuggest API and returns
#' information about the records found. Note that the API returns a maximum of
#' 10 records, and that those records are not VIAF cluster records.
#' A VIAF record is considered a "cluster record," which is the result of
#' combining records from many libraries around the world into a single record.
#' @param author String to search. Please see the structure of the author
#' string to obtain better results:
#'   author: last name, first name\[,\] \[(\[year_of_birth\]\[-year_of_death\])\]
#' @return A data-frame with four columns from the elements "term", "score",
#' "nametype" and "viafid" of the Autosuggest API response.
#' @seealso https://developer.api.oclc.org/viaf-api#/Authority%20Cluster
#' @examples
#' v_AutoSuggest('Iranzo')
#' v_AutoSuggest('Esparza, María')
#' # Four rows, only two viafid:
#' v_AutoSuggest('Escobar, Modesto')
#' @export
v_AutoSuggest <- function(author) {
  url <- "https://www.viaf.org/viaf/AutoSuggest"
  query <- list(query = author)
  if(!curl::has_internet() || httr::http_error(url,query=query)){
    message("No internet connection or data source broken.")
    return(NULL)
  }
  #
  tryCatch(
    {
      r <- httr::GET(url = url,
                     httr::user_agent(user_agent),
                     query = query)
      httr::stop_for_status(r)
      content <- httr::content(r, as = "text", encoding = "UTF-8")
      d <- jsonlite::fromJSON(content, simplifyVector = FALSE)
      # Return only these columns: "term", "score", "nametype", "viafid"
      output <- t(sapply(d$result, function(x) x[c("term", "score", "nametype", "viafid")]))
      return(output)
    }, error = function(e) {
      cat(as.character(e), file = stderr())
      return(NULL)
    }
  )
}

#' Run a CQL Query in VIAF
#'
#' Run the CQL_Query using the VIAF Search API and return a list of records
#' found. The search string is formed using the CQL_Query syntax of the API.
#' Note that returned records use the "info:srw/schema/1/JSON" record schema,
#' i.e., are complete cluster records packed in JSON format. If the number
#' of records found is greater than 250 (API restrictions), successive requests
#' are made.
#' @param CQL_Query String with the CQL query, or just a name/title if a mode
#' is specified. See https://developer.api.oclc.org/viaf-api#/Authority%20Cluster
#' @param mode apply a predefined query:
#' 'anyField' -> 'cql.any = "string"'
#' Search preferred Name - names which are the preferred form in an authority
#' record (1xx fields of the MARC records);
#' 'allmainHeadingEl' -> 'local.mainHeadingEl all "name"'
#' Search the same as previous, but all terms are searched;
#' 'allNames' -> 'local.names all "name"'
#' Search Names - any name preferred or alternate (1xx, 4xx, 5xx fields of the
#' MARC records);
#' 'allPersonalNames' -> 'local.personalNames all "name"'
#' Search Personal Names within the authority record (100, 400, 500 fields of
#' MARC records);
#' 'allTitle' -> 'local.title all "title"'
#' Search for titles.
#' By 'default', no predefined query will be applied.
#' @param schema The recordSchema of the query: if 'brief', the returned
#' records are simpler; if 'JSON' (the default), the complete cluster records
#' are returned.
#' @return A list with the records found.
#' @examples
#' \dontrun{
#' ## Search in any field (cql.any)
#' # Operator is "=": so search one or more terms:
#' CQL_Query <- 'cql.any = "García Iranzo, Juan"'
#' r <- v_Search(CQL_Query)
#' # r contains complete VIAF records (sometimes seen as a "cluster record",
#' # which is unified by combining records from many libraries around the world)
#'
#' # Search in 1xx, 4xx, 5xx fields of MARC record (local.names)
#' # Operator is "all": search all terms
#' CQL_Query <- 'local.names all "Modesto Escobar"'
#' r <- v_Search(CQL_Query)
#'
#' # Search in 100, 400, 500 fields of MARC record (local.personalNames)
#' # Operator is "all": search all terms
#' CQL_Query <- 'local.personalNames all "Modesto Escobar"'
#' r <- v_Search(CQL_Query)
#'
#' # Search in Titles
#' CQL_Query <- 'local.title all "Los pronósticos electorales con encuestas"'
#' r <- v_Search(CQL_Query)
#' }
#' @export
v_Search <- function(CQL_Query, mode=c('default', 'anyField', 'allmainHeadingEl',
                                       'allNames', 'allPersonalNames', 'allTitle'),
                     schema=c('JSON', 'brief')) {
  mode <- mode[1]
  if(mode=='anyField'){
    CQL_Query <- paste0("cql.any = ", CQL_Query)
  }else if(mode=='allmainHeadingEl'){
    CQL_Query <- paste0("local.mainHeadingEl all ", CQL_Query)
  }else if(mode=='allNames'){
    CQL_Query <- paste0("local.names all ", CQL_Query)
  }else if(mode=='allPersonalNames'){
    CQL_Query <- paste0("local.personalNames all ", CQL_Query)
  }else if(mode=='allTitle'){
    CQL_Query <- paste0("local.title all ", CQL_Query)
  }

  schema <- schema[1]
  if(schema=='brief')
    recordSchema <- 'http://viaf.org/BriefVIAFCluster'
  else
    recordSchema <- 'info:srw/schema/1/JSON'

  searchFn <- function(cql_query){
    maxrecords <- VIAF_LIMIT   # 250 records per request (API restriction)
    url <- "https://www.viaf.org/viaf/search"
    query <- list(httpAccept = 'application/json',
                  maximumRecords = maxrecords,
                  recordSchema = recordSchema,
                  startRecord = 1,
                  query = cql_query)
    if(!curl::has_internet() || httr::http_error(url,query=query)){
      message("No internet connection or data source broken.")
      return(NULL)
    }
    #
    tryCatch(
      {
        output <- list()
        while(TRUE) {
          r <- httr::GET(url = url,
                         httr::user_agent(user_agent),
                         query = query)
          httr::stop_for_status(r)
          # cat(r$url)
          content <- httr::content(r, as = "text", encoding = "UTF-8")
          d <- jsonlite::fromJSON(content, simplifyVector = FALSE)
          nrecords <- d$searchRetrieveResponse$numberOfRecords
          records  <- d$searchRetrieveResponse$records
          output <- append(output, records)
          if (length(output) >= as.integer(nrecords)) {
            if (query$startRecord > 1)
              cat(paste0(' INFO: Retrieved ', length(output), ' records.\n'), file = stderr())
            return(output)
          }
          if (query$startRecord == 1)
            cat(paste0("INFO: Number of records found (", nrecords,
                       ") excedes the maximun per request API limit (", maxrecords,
                       "). Doing sucesivelly requests.\n"), file = stderr())
          cat(paste0(' INFO: Retrieved ', length(output), ' records.\n'), file = stderr())
          query$startRecord <- query$startRecord + maxrecords
        }
      }, error = function(e) {
        cat(as.character(e), file = stderr())
        return(NULL)
      }
    )
  }

  return(searchFn(CQL_Query))
}


#' Gets a VIAF cluster record
#'
#' Obtains the cluster record identified by viafid from VIAF, in the format
#' indicated in record_format. Note that the returned record may be a VIAF
#' cluster record or a redirect/scavenged record: the function returns the
#' record as is.
#' @param viafid The VIAF identifier.
#' @param record_format 'viaf.json' (default) or others in
#' https://developer.api.oclc.org/viaf-api#/Authority%20Cluster
#' @return The VIAF record cluster in the format indicated in record_format.
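#' @examples
#' \dontrun{
#' # A minimal sketch: fetch a cluster record in JSON format for a VIAF id
#' # (here '17220427', assumed to correspond to Miguel de Cervantes); ids can
#' # also be obtained with v_AutoSuggest() or v_Search():
#' r <- v_GetRecord("17220427", record_format = "viaf.json")
#' }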
#' @export
v_GetRecord <- function(viafid, record_format='viaf.json') {
  url <- paste0("https://viaf.org/viaf/", viafid, '/', record_format)
  if(!curl::has_internet() || httr::http_error(url)){
    message("No internet connection or data source broken.")
    return(NULL)
  }
  #
  tryCatch(
    {
      r <- httr::GET(url = url,
                     httr::user_agent(user_agent))
      httr::stop_for_status(r)
      content <- httr::content(r, as = "text", encoding = "UTF-8")
      # JSON formats:
      if (grepl('json', record_format)) {
        d <- jsonlite::fromJSON(content, simplifyVector = FALSE)
        return(d)
      }
      # Other formats:
      return(content)
    }, error = function(e) {
      cat(as.character(e), file = stderr())
      return(NULL)
    }
  )
}

#' Gets information from a VIAF record
#'
#' Returns the requested information from a VIAF record. Note that the VIAF
#' record must be in JSON format.
#' @param viaf VIAF cluster record (in JSON format).
#' @param info Mandatory; selects which information to retrieve.
#' The options are 'titles', 'gender', 'dates', 'occupations', 'sources',
#' 'sourceId', 'coauthors', 'wikipedias' or 'allinfo'.
#' @param source The identifier of the source (LC, WKP, JPG, BNE...).
#' Only used if info='sourceId'.
#' @return Depends on the info selected:
#' 'titles' A list with titles;
#' 'gender' The gender of the author, or NULL if it does not exist in the record;
#' 'dates' The birth and death years in the format byear:dyear;
#' 'occupations' A data-frame with sources and occupations from each source, or
#' NULL if occupations do not exist in the record;
#' 'sources' A data-frame with text and sources;
#' 'sourceId' A data-frame with columns text and source, or NULL if the source
#' does not exist in the VIAF record;
#' 'coauthors' A data-frame with the co-authors and their counts;
#' 'wikipedias' A vector with the URLs of the Wikipedias;
#' 'allinfo' A list with all of the above.
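#' @examples
#' \dontrun{
#' # A minimal sketch; the id below is assumed to still resolve to a JSON
#' # cluster record (Miguel de Cervantes), and 'BNE' is assumed to be one of
#' # its sources:
#' viaf <- v_GetRecord("17220427", record_format = "viaf.json")
#' v_Extract(viaf, info = "dates")
#' v_Extract(viaf, info = "wikipedias")
#' v_Extract(viaf, info = "sourceId", source = "BNE")
#' }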
#' @export
v_Extract <- function(viaf, info, source=NULL){
  opts <- c('titles', 'gender', 'dates', 'occupations', 'sources', 'sourceId', 'coauthors', 'wikipedias', 'allinfo')
  if(!(info %in% opts)){
    stop(paste0("info: must select an option between '",paste0(opts,collpase="', '"),"'"))
  }

  if(info=="sourceId"){
    if(is.null(source)){
      stop("source: the identifier of the source must be supplied (LC, WKP, JPG, BNE...)")
    }
    return(v_sourceId(viaf,source))
  }else{
    retrieveFn <- get(paste0('v_',info))
    return(retrieveFn(viaf))
  }
}

# Gets work titles from a VIAF record
v_titles <- function(viaf) {
  if (is.null(viaf$titles) || is.null(viaf$titles$work))
    return(NULL)
  if (!is.null(viaf$titles$work$title))
    return(viaf$titles$work$title)
  titles = character()
  for (w in viaf$titles$work) {
    titles <- append(titles, w$title)
  }
  return(titles)
}

# Gets author's gender from a VIAF record
v_gender <- function(viaf) {
  if (is.null(viaf$fixed))
    return(NULL)
  if (viaf$fixed$gender == 'a')
    return('female')
  else if (viaf$fixed$gender == 'b')
    return('male')
  else
    return(viaf$fixed$gender)
}

# Gets birth and death years from a VIAF record
v_dates <- function(viaf) {
  if (is.null(viaf$birthDate))
    byear <- ''
  else
    byear <- substr(viaf$birthDate, 1, 4)
  #
  if (is.null(viaf$deathDate))
    dyear <- ''
  else
    dyear <- substr(viaf$deathDate, 1, 4)
  #
  return(paste0(byear,":", dyear))
}

# Gets occupations from a VIAF record
v_occupations <- function(viaf) {
  if (is.null(viaf$occupation))
    return(NULL)
  #
  dd <- viaf$occupation$data
  if (!is.null(dd$text))
    dd <- list(dd)
  # df1 is the returned data-frame.
  df1 = data.frame()
  for (l in dd) {
    occ <- list()
    for (s in l$sources$s) {
      occ[[s]] <- l$text
    }
    df2 <- data.frame(occ)
    if (nrow(df1) == 0)
      df1 <- df2
    else
      # https://stackoverflow.com/questions/3402371/combine-two-data-frames-by-rows-rbind-when-they-have-different-sets-of-columns
      df1 <- rbind(
        data.frame(c(df1, sapply(setdiff(names(df2), names(df1)), function(x) NA))),
        data.frame(c(df2, sapply(setdiff(names(df1), names(df2)), function(x) NA)))
      )
  }
  return(df1)
}
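
# Note on the rbind() idiom above (also used in v_coauthors and v_sources):
# it row-binds two data frames whose column sets may differ, filling missing
# columns with NA. A toy sketch with illustrative column names (kept as
# comments so nothing runs when the file is sourced):
#   df1 <- data.frame(LC = "Historians", WKP = "historian")
#   df2 <- data.frame(WKP = "writer", BNE = "Escritores")
#   rbind(
#     data.frame(c(df1, sapply(setdiff(names(df2), names(df1)), function(x) NA))),
#     data.frame(c(df2, sapply(setdiff(names(df1), names(df2)), function(x) NA)))
#   )
#   # -> two rows with columns LC, WKP and BNE; missing cells are NA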

# Gets coauthors from a VIAF record
v_coauthors <- function(viaf) {
  if (is.null(viaf$coauthors))
    return(NULL)
  #
  dd <- viaf$coauthors$data
  if (!is.null(dd$text))
    dd <- list(dd)
  # df1 is the returned data-frame.
  df1 = data.frame()
  coa <- list()
  for (l in dd) {
    coa <- list()
    coa$coauthor <- l$text
    coa$count    <- l[['@count']]
    df2 <- data.frame(coa)

    if (nrow(df1) == 0)
      df1 <- df2
    else
      # https://stackoverflow.com/questions/3402371/combine-two-data-frames-by-rows-rbind-when-they-have-different-sets-of-columns
      df1 <- rbind(
        data.frame(c(df1, sapply(setdiff(names(df2), names(df1)), function(x) NA))),
        data.frame(c(df2, sapply(setdiff(names(df1), names(df2)), function(x) NA)))
      )
    #
  }
  return(df1)
}

# Gets text of all sources from a VIAF record
v_sources <- function(viaf) {
  vv <- viaf$mainHeadings$data
  if (!is.null(vv$text))
    vv <- list(vv)
  # df1 is the returned data-frame.
  df1 = data.frame()
  for (l in vv) {
    sids <- list()
    for (sid in l$sources$sid) {
      s <- strsplit(sid, '|', fixed = TRUE)[[1]]
      sids[[s[1]]] <- s[2]
    }
    # texts <- rbind(texts, c(l$text, paste(l$sources$sid, collapse = ';')))
    # texts <- rbind(texts, c(l$text, sids))
    df2 <- data.frame(c(l$text, sids))
    ## Unify colname of first column
    colnames(df2)[1] <- c('text')
    if (nrow(df1) == 0)
      df1 <- df2
    else
      # https://stackoverflow.com/questions/3402371/combine-two-data-frames-by-rows-rbind-when-they-have-different-sets-of-columns
      df1 <- rbind(
        data.frame(c(df1, sapply(setdiff(names(df2), names(df1)), function(x) NA))),
        data.frame(c(df2, sapply(setdiff(names(df1), names(df2)), function(x) NA)))
      )
  }
  return(df1)
}

# Gets text and identifier of the source from a VIAF record
v_sourceId <- function(viaf, source) {
  texts <- v_sources(viaf)
  if (is.null(texts[[source]]))
    return(NULL)
  else
    return(texts[!is.na(texts[[source]]), c('text', source)])
}

# Gets Wikipedia pages (URL) from a VIAF record
v_wikipedias <- function(viaf) {
  if (is.null(viaf$xLinks) || is.null(viaf$xLinks$xLink))
    return(NULL)
  #
  wikis <- character()
  xLinks <- viaf$xLinks$xLink
  if (!is.null(xLinks$`#text`)) {
    url <- xLinks[['#text']]
    if (grepl('https?://[^.]+\\.wikipedia\\.org', url))
      wikis <- append(wikis, url)
  }
  else {
    for (l in xLinks) {
      url <- l[['#text']]
      if (grepl('https?://[^.]+\\.wikipedia\\.org', url))
        wikis <- append(wikis, url)
    }
  }
  if (length(wikis) == 0)
    return(NULL)
  return(wikis)
}

# Gets all the information in a single list
v_allinfo <- function(viaf) {
  l <- list()
  l$vid <- viaf$viafID
  l$gender <- v_gender(viaf)
  l$dates  <- v_dates(viaf)
  l$sources <- v_sources(viaf)
  l$titles <- v_titles(viaf)
  l$occupations <- v_occupations(viaf)
  l$coauthors <- v_coauthors(viaf)
  l$wikipedias <- v_wikipedias(viaf)
  return(l)
}
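
# End-to-end sketch of the VIAF helpers (kept as comments so nothing runs when
# the file is sourced); the author string is only illustrative:
#   sug    <- v_AutoSuggest('Cervantes Saavedra, Miguel de')
#   viafid <- unlist(sug[, 'viafid'])[1]
#   viaf   <- v_GetRecord(viafid, record_format = 'viaf.json')
#   info   <- v_Extract(viaf, info = 'allinfo')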
