In SymbolixAU/mongolite_test: Fast and Simple MongoDB Client for R

On most platforms, starting the MongoDB server is as easy as:

mongod

If this is the first time you run MongoDB, you need to create a directory /data/db and make it writable (use C:/data/db on Windows).

Hello World

# Hello World walkthrough: insert a data frame into MongoDB, query it,
# cross-tabulate it with map-reduce, and round-trip it through a JSON-lines
# file. Requires a running local mongod plus the ggplot2 package for the
# example data.

# Attach the packages used below up front
library(mongolite)
library(jsonlite)

# Init connection to local mongod
m <- mongo(collection = "diamonds")

# Insert test data
data(diamonds, package = "ggplot2")
m$insert(diamonds)

# Check records: number of stored documents vs. rows in the data frame
m$count()
nrow(diamonds)

# Perform a query and retrieve data
out <- m$find('{"cut" : "Premium", "price" : { "$lt" : 1000 } }')

# Compare the query result with the equivalent in-memory subset
nrow(out)
nrow(subset(diamonds, cut == "Premium" & price < 1000))

# Cross-table
tbl <- m$mapreduce(
  map = "function(){emit({cut:this.cut, color:this.color}, 1)}",
  reduce = "function(id, counts){return Array.sum(counts)}"
)
# Same as:
data.frame(with(diamonds, table(cut, color)))

# Stream jsonlines into a connection
tmp <- tempfile()
m$export(file(tmp))

# Stream it back in R
mydata <- stream_in(file(tmp))

# Or into mongo (count before and after the import)
m2 <- mongo("diamonds2")
m2$count()
m2$import(file(tmp))
m2$count()

# Remove the collections
m$drop()
m2$drop()

# Delete the temporary export file now that nothing reads from it anymore
unlink(tmp)

Flights

Some example queries from the dplyr tutorials.

# Flights walkthrough: load the nycflights13 flights table into MongoDB and
# run filtering, sorting, projection, distinct, aggregation and map-reduce
# queries against it, then dump the collection to BSON and restore it.

# Insert some data
data(flights, package = "nycflights13")
m <- mongo(collection = "nycflights")
m$insert(flights)

# Basic queries
m$count('{"month":1, "day":1}')
jan1 <- m$find('{"month":1, "day":1}')

# Sorting
jan1 <- m$find('{"month":1,"day":1}', sort='{"distance":-1}')
head(jan1)

# Sorting on large data requires index
m$index(add = "distance")
allflights <- m$find(sort='{"distance":-1}')

# Select columns
jan1 <- m$find('{"month":1,"day":1}', fields = '{"_id":0, "distance":1, "carrier":1}')

# List unique values
m$distinct("carrier")
m$distinct("carrier", '{"distance":{"$gt":3000}}')

# Tabulate
m$aggregate('[{"$group":{"_id":"$carrier", "count": {"$sum":1}, "average":{"$avg":"$distance"}}}]')

# Map-reduce (binning): count flights per 100-unit distance bin.
# The result is stored as `distance_bins` so that it does not mask
# graphics::hist().
distance_bins <- m$mapreduce(
  map = "function(){emit(Math.floor(this.distance/100)*100, 1)}",
  reduce = "function(id, counts){return Array.sum(counts)}"
)

# Dump to bson (the path is kept in `dump_file` so base::dump() stays visible)
dump_file <- tempfile()
m$export(file(dump_file), bson = TRUE)

# Remove the collection
m$drop()

# Restore (count before and after the import)
m$count()
m$import(file(dump_file), bson = TRUE)
m$count()

# Delete the temporary dump once the collection has been restored
unlink(dump_file)

Combine with jsonlite

Example data with zipcodes from the MongoDB tutorial. This dataset has an _id column, so you cannot insert it more than once.

library(jsonlite)
library(mongolite)

# Stream from url into mongo: every batch that stream_in() hands to the
# handler is inserted into the "zips" collection
m <- mongo("zips", verbose = FALSE)
stream_in(
  url("http://media.mongodb.org/zips.json"),
  handler = function(batch) {
    m$insert(batch)
  }
)

# Check count
m$count()

# Import. Note the 'location' column is actually an array!
zips <- m$find()

# Clean up: remove the collection again
m$drop()

Nested data

Stream large bulk samples from openweathermap with deeply nested data (takes a while).

# Stream the gzipped sample file into the "weather" collection, inserting
# each batch that stream_in() hands to the handler (50 records per page)
m <- mongo("weather", verbose = FALSE)
stream_in(
  gzcon(url("http://bulk.openweathermap.org/sample/daily_14.json.gz")),
  handler = function(batch) {
    m$insert(batch)
  },
  pagesize = 50
)

# Query on a nested field using dot notation, then show the `data` column
berlin <- m$find('{"city.name" : "Berlin"}')
print(berlin$data)