library("elastic")
The main interface to searching documents in your Elasticsearch store is the function Search(). I nearly always develop R software using all lowercase, but R has a function called search(), and I wanted to avoid collision with that function.
Search() is an interface to both the HTTP search API (in which queries are passed in the URI of the request, meaning queries have to be relatively simple) and the POST API, or Query DSL, in which queries are passed in the body of the request (so can be much more complex).
There are a huge number of ways you can search Elasticsearch documents - this tutorial covers some of them, and highlights the ways in which you interact with the R outputs.
x <- connect()
out <- Search(x, index="shakespeare")
out$hits$total
#> $value #> [1] 5000 #> #> $relation #> [1] "eq"
out$hits$hits[[1]]
#> $`_index` #> [1] "shakespeare" #> #> $`_type` #> [1] "_doc" #> #> $`_id` #> [1] "0" #> #> $`_score` #> [1] 1 #> #> $`_source` #> $`_source`$line_id #> [1] 1 #> #> $`_source`$play_name #> [1] "Henry IV" #> #> $`_source`$line_number #> [1] "" #> #> $`_source`$speaker #> [1] "" #> #> $`_source`$text_entry #> [1] "ACT I"
Search(x, index = "shakespeare")$hits$hits[[1]]
#> $`_index` #> [1] "shakespeare" #> #> $`_type` #> [1] "_doc" #> #> $`_id` #> [1] "0" #> #> $`_score` #> [1] 1 #> #> $`_source` #> $`_source`$line_id #> [1] 1 #> #> $`_source`$play_name #> [1] "Henry IV" #> #> $`_source`$line_number #> [1] "" #> #> $`_source`$speaker #> [1] "" #> #> $`_source`$text_entry #> [1] "ACT I"
Search(x, index = "shakespeare", body = '{ "_source": ["play_name", "speaker"] }')$hits$hits[[1]]
#> $`_index` #> [1] "shakespeare" #> #> $`_type` #> [1] "_doc" #> #> $`_id` #> [1] "0" #> #> $`_score` #> [1] 1 #> #> $`_source` #> $`_source`$play_name #> [1] "Henry IV" #> #> $`_source`$speaker #> [1] ""
Search(x, index="shakespeare", size=1, from=1)$hits
#> $total #> $total$value #> [1] 5000 #> #> $total$relation #> [1] "eq" #> #> #> $max_score #> [1] 1 #> #> $hits #> $hits[[1]] #> $hits[[1]]$`_index` #> [1] "shakespeare" #> #> $hits[[1]]$`_type` #> [1] "_doc" #> #> $hits[[1]]$`_id` #> [1] "1" #> #> $hits[[1]]$`_score` #> [1] 1 #> #> $hits[[1]]$`_source` #> $hits[[1]]$`_source`$line_id #> [1] 2 #> #> $hits[[1]]$`_source`$play_name #> [1] "Henry IV" #> #> $hits[[1]]$`_source`$line_number #> [1] "" #> #> $hits[[1]]$`_source`$speaker #> [1] "" #> #> $hits[[1]]$`_source`$text_entry #> [1] "SCENE I. London. The palace."
Using the q parameter you can pass in a query, which gets passed in the URI of the request. This type of query is less powerful than a query passed in the body of the request using the body parameter (shown below).
Search(x, index="shakespeare", q="speaker:KING HENRY IV")$hits$total
#> $value #> [1] 5000 #> #> $relation #> [1] "eq"
Here, query for values from 10 to 20 in the field line_id
Search(x, index="shakespeare", q="line_id:[10 TO 20]")$hits$total
#> $value #> [1] 11 #> #> $relation #> [1] "eq"
Version number usually is not returned.
sapply(Search(x, index="shakespeare", version=TRUE, size=2)$hits$hits, "[[", "_version")
#> [1] 1 1
Search(x, index="shakespeare", raw=TRUE)
#> [1] "{\"took\":0,\"timed_out\":false,\"_shards\":{\"total\":1,\"successful\":1,\"skipped\":0,\"failed\":0},\"hits\":{\"total\":{\"value\":5000,\"relation\":\"eq\"},\"max_score\":1.0,\"hits\":[{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"0\",\"_score\":1.0,\"_source\":{\"line_id\":1,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"ACT I\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"1\",\"_score\":1.0,\"_source\":{\"line_id\":2,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"SCENE I. London. The palace.\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"2\",\"_score\":1.0,\"_source\":{\"line_id\":3,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"3\",\"_score\":1.0,\"_source\":{\"line_id\":4,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.1\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"So shaken as we are, so wan with care,\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"4\",\"_score\":1.0,\"_source\":{\"line_id\":5,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.2\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Find we a time for frighted peace to pant,\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"5\",\"_score\":1.0,\"_source\":{\"line_id\":6,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.3\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"And breathe short-winded accents of new broils\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"6\",\"_score\":1.0,\"_source\":{\"line_id\":7,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.4\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"To be commenced in strands afar 
remote.\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"7\",\"_score\":1.0,\"_source\":{\"line_id\":8,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.5\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"No more the thirsty entrance of this soil\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"8\",\"_score\":1.0,\"_source\":{\"line_id\":9,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.6\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Shall daub her lips with her own childrens blood;\"}},{\"_index\":\"shakespeare\",\"_type\":\"_doc\",\"_id\":\"9\",\"_score\":1.0,\"_source\":{\"line_id\":10,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.7\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Nor more shall trenching war channel her fields,\"}}]}}"
Common options are verbose=TRUE, timeout_ms=1, followlocation=TRUE.
out <- Search(x, index="shakespeare", verbose = TRUE)
Pass in as an R list
mapping_create(x, "shakespeare", update_all_types = TRUE, body = '{ "properties": { "text_entry": { "type": "text", "fielddata": true } } }')
#> $acknowledged #> [1] TRUE
aggs <- list(aggs = list(stats = list(terms = list(field = "text_entry"))))
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index` #> [1] "shakespeare" #> #> $`_type` #> [1] "_doc" #> #> $`_id` #> [1] "0" #> #> $`_score` #> [1] 1 #> #> $`_source` #> $`_source`$line_id #> [1] 1 #> #> $`_source`$play_name #> [1] "Henry IV" #> #> $`_source`$line_number #> [1] "" #> #> $`_source`$speaker #> [1] "" #> #> $`_source`$text_entry #> [1] "ACT I"
Or pass in as json query with newlines, easy to read
aggs <- '{
    "aggs": {
        "stats" : {
            "terms" : {
                "field" : "text_entry"
            }
        }
    }
}'
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index` #> [1] "shakespeare" #> #> $`_type` #> [1] "_doc" #> #> $`_id` #> [1] "0" #> #> $`_score` #> [1] 1 #> #> $`_source` #> $`_source`$line_id #> [1] 1 #> #> $`_source`$play_name #> [1] "Henry IV" #> #> $`_source`$line_number #> [1] "" #> #> $`_source`$speaker #> [1] "" #> #> $`_source`$text_entry #> [1] "ACT I"
Or pass in collapsed json string
aggs <- '{"aggs":{"stats":{"terms":{"field":"text_entry"}}}}'
Search(x, index="shakespeare", body=aggs)$hits$hits[[1]]
#> $`_index` #> [1] "shakespeare" #> #> $`_type` #> [1] "_doc" #> #> $`_id` #> [1] "0" #> #> $`_score` #> [1] 1 #> #> $`_source` #> $`_source`$line_id #> [1] 1 #> #> $`_source`$play_name #> [1] "Henry IV" #> #> $`_source`$line_number #> [1] "" #> #> $`_source`$speaker #> [1] "" #> #> $`_source`$text_entry #> [1] "ACT I"
Histograms
aggs <- '{ "aggs": { "latbuckets" : { "histogram" : { "field" : "decimalLatitude", "interval" : 5 } } } }'
Search(x, index="gbif", body=aggs, size=0)$aggregations$latbuckets$buckets[1:3]
#> [[1]] #> [[1]]$key #> [1] -35 #> #> [[1]]$doc_count #> [1] 1 #> #> #> [[2]] #> [[2]]$key #> [1] -30 #> #> [[2]]$doc_count #> [1] 0 #> #> #> [[3]] #> [[3]]$key #> [1] -25 #> #> [[3]]$doc_count #> [1] 0
mmatch <- '{ "query": { "bool" : { "must_not" : { "range" : { "speech_number" : { "from" : 1, "to": 5 }}}}}}'
sapply(Search(x, index="shakespeare", body=mmatch)$hits$hits, function(x) x$`_source`$speech_number)
#> [[1]] #> NULL #> #> [[2]] #> NULL #> #> [[3]] #> NULL #> #> [[4]] #> [1] 6 #> #> [[5]] #> [1] 6 #> #> [[6]] #> [1] 7 #> #> [[7]] #> [1] 7 #> #> [[8]] #> [1] 7 #> #> [[9]] #> [1] 7 #> #> [[10]] #> [1] 7
Fuzzy query on a text field
fuzzy <- list(query = list(fuzzy = list(text_entry = "arms")))
Search(x, index="shakespeare", body = fuzzy)$hits$total
#> $value #> [1] 49 #> #> $relation #> [1] "eq"
fuzzy <- list(query = list(fuzzy = list(text_entry = list(value = "arms", fuzziness = 4))))
Search(x, index="shakespeare", body=fuzzy)$hits$total
#> $value #> [1] 618 #> #> $relation #> [1] "eq"
With numeric
body <- list(query=list(range=list(decimalLongitude=list(gte=1, lte=3))))
Search(x, 'gbif', body=body)$hits$total
#> $value #> [1] 24 #> #> $relation #> [1] "eq"
body <- list(query=list(range=list(decimalLongitude=list(gte=2.9, lte=10))))
Search(x, 'gbif', body=body)$hits$total
#> $value #> [1] 126 #> #> $relation #> [1] "eq"
With dates
body <- list(query=list(range=list(eventDate=list(gte="2012-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total
#> $value #> [1] 301 #> #> $relation #> [1] "eq"
body <- list(query=list(range=list(eventDate=list(gte="2014-01-01", lte="now"))))
Search(x, 'gbif', body=body)$hits$total
#> $value #> [1] 292 #> #> $relation #> [1] "eq"
body <- '{ "query": { "more_like_this": { "fields": ["abstract","title"], "like": "and then", "min_term_freq": 1, "max_query_terms": 12 } } }'
Search(x, 'plos', body=body)$hits$total
#> $value #> [1] 488 #> #> $relation #> [1] "eq"
body <- '{ "query": { "more_like_this": { "fields": ["abstract","title"], "like": "cell", "min_term_freq": 1, "max_query_terms": 12 } } }'
Search(x, 'plos', body=body)$hits$total
#> $value #> [1] 58 #> #> $relation #> [1] "eq"
body <- '{ "query": { "query_string": { "query" : "cell" } }, "highlight": { "fields": { "title": {"number_of_fragments": 2} } } }'
out <- Search(x, 'plos', body=body)
out$hits$total
#> $value #> [1] 58 #> #> $relation #> [1] "eq"
sapply(out$hits$hits, function(x) x$highlight$title[[1]])[8:10]
#> [1] "Functional Analysis of the Drosophila Embryonic Germ <em>Cell</em> Transcriptome by RNA Interference" #> [2] "Diversin Is Overexpressed in Breast Cancer and Accelerates <em>Cell</em> Proliferation and Invasion" #> [3] "c-FLIP Protects Eosinophils from TNF-α-Mediated <em>Cell</em> Death In Vivo"
Search(x, 'shakespeare', q="a*")$hits$total
#> $value #> [1] 2747 #> #> $relation #> [1] "eq"
res <- Search(x, index = 'shakespeare', q="a*", time_scroll = "1m")
length(scroll(x, res$`_scroll_id`, time_scroll = "1m")$hits$hits)
#> [1] 10
res <- Search(x, index = 'shakespeare', q = "a*", time_scroll = "5m")
out <- res$hits$hits
hits <- 1
while (hits != 0) {
  res <- scroll(x, res$`_scroll_id`)
  hits <- length(res$hits$hits)
  if (hits > 0) out <- c(out, res$hits$hits)
}
length(out)
#> [1] 2747
res$hits$total
#> $value #> [1] 2747 #> #> $relation #> [1] "eq"
Woohoo! Collected all 2747 documents in very little time.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.