knitr::opts_chunk$set(collapse = TRUE)

roto.athena

Perform and Manage 'Amazon' 'Athena' Queries

Description

This is a 'reticulated' wrapper for the 'Python' 'boto3' 'AWS' 'Athena' client library https://boto3.readthedocs.io/en/latest/reference/services/athena.html. It requires 'Python' version 3.5+ and an 'AWS' account. Tools are also provided to execute 'dplyr' chains asynchronously.

NOTE

This package requires Python >= 3.5 to be available and the boto3 Python module. The package author highly recommends setting RETICULATE_PYTHON=/usr/local/bin/python3 in your ~/.Renviron file to ensure R + reticulate will use the proper python version.

What's Inside The Tin

The following functions are implemented:

Installation

devtools::install_github("hrbrmstr/roto.athena")
# OR
devtools::install_git("git://gitlab.com/hrbrmstr/roto.athena")
options(width=120)

Usage

library(roto.athena)
library(tidyverse)

# current verison
packageVersion("aws.athena")

Basic Usage

# see recent queries
x <- list_query_executions(profile = "personal")

head(x$QueryExecutionIds)

# get last 5 query executions
y <- get_query_executions(x$QueryExecutionIds[1:5], profile = "personal")

# only look at the ones that succeeded
filter(y$QueryExecutions, state == "SUCCEEDED") 

# fire off another one!
start_query_execution(
  query = "SELECT * FROM elb_logs LIMIT 100",
  database = "sampledb",
  output_location = "s3://aws-athena-query-results-redacted",
  profile = "personal"
) -> sqe
Sys.sleep(2)
# see the status
get_query_execution(sqe, profile = "personal") %>% 
  glimpse()

# get the results
res <- get_query_results(sqe, profile = "personal")

res

Async dplyr calls

library(odbc)
library(DBI)

DBI::dbConnect(
  odbc::odbc(), 
  driver = "/Library/simba/athenaodbc/lib/libathenaodbc_sbu.dylib", 
  Schema = "sampledb",
  AwsRegion = "us-east-1",
  AwsProfile = "personal",
  AuthenticationType = "IAM Profile",
  S3OutputLocation = "s3://aws-athena-query-results-redacted"
) -> con

elb_logs <- tbl(con, "elb_logs")

mutate(elb_logs, tsday = substr(timestamp, 1, 10)) %>% 
  filter(tsday == "2014-09-29") %>%
  select(requestip, requestprocessingtime) %>% 
  collect_async(
    database = "sampledb", 
    output_location = "s3://aws-athena-query-results-redacted",
    profile_name = "personal"
  ) -> id
Sys.sleep(2)
get_query_execution(id, profile = "personal") %>% 
  glimpse()

get_query_results(id, profile = "personal")


hrbrmstr/roto.athena documentation built on May 29, 2019, 11:42 a.m.