#' Get Google Analytics 360 BigQuery data
#'
#' Turn a google_analytics style call into BigQuery SQL.
#' Used with Google Analytics 360 BigQuery exports.
#'
#' All data will be unsampled, and requests will cost money against your BigQuery quota.
#'
#' Requires installation of bigQueryR and authentication via ga_bq_auth()
#' or googleAuthR::gar_auth() with the BigQuery scope set.
#' After authenticating, view your available projectIds via [bigQueryR::bqr_list_projects()]
#'
#' Segments are not supported at this time.
#'
#' Goals are not included in BigQuery exports, so you will need to look at how each goal
#' is defined in your View and replicate its logic in the query, e.g. as unique pageviews or unique events.
#'
#' Custom dimensions can be queried at either session or hit level, ignoring the scope configured in the GA interface.
#'
#' You can get a sample Google Analytics dataset in BigQuery by following the instructions
#' here: <https://support.google.com/analytics/answer/3416091?hl=en>
#'
#'
#' @param projectId The Google project Id where the BigQuery exports sit
#' @param datasetId DatasetId of the GA export. This should match the GA View ID
#' @param start start date (YYYY-MM-DD)
#' @param end end date (YYYY-MM-DD)
#' @param metrics metrics to query
#' @param dimensions dimensions to query
#' @param sort metric to sort by (descending)
#' @param filters filter results (not currently used in query construction)
#' @param max_results How many results to fetch. 100,000 or more triggers an asynchronous extract via Google Cloud Storage
#' @param query If non-NULL, this query is used directly and the arguments above are ignored
#' @param return_query_only Only return the constructed query, don't call BigQuery
#' @param bucket If 100,000 or more results are requested, a Google Cloud Storage bucket to send the data to
#' @param download_file Where to save the asynchronous extract files. If NULL, saves to the current working directory.
#'
#' @return data.frame of results. For large asynchronous extracts the data is instead saved to `download_file` and NULL is returned.
#'
#' @seealso <https://support.google.com/analytics/answer/4419694?hl=en>
#' <https://support.google.com/analytics/answer/3437719?hl=en>
#'
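#' @examples
#'
#' \dontrun{
#' ## A minimal usage sketch: "my-project" and datasetId "123456" are
#' ## placeholders for your own BigQuery project and GA360 export dataset.
#' library(googleAnalyticsR)
#'
#' ## authenticate with the BigQuery scope
#' ga_bq_auth()
#'
#' ## unsampled daily sessions and pageviews by medium
#' bq_result <- google_analytics_bq("my-project", "123456",
#'                                  start = "2016-01-01", end = "2016-02-01",
#'                                  metrics = c("sessions", "pageviews"),
#'                                  dimensions = c("date", "medium"))
#' }
#'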
#' @export
google_analytics_bq <- function(projectId,
                                datasetId,
                                start = NULL,
                                end = NULL,
                                metrics = NULL,
                                dimensions = NULL,
                                sort = NULL,
                                filters = NULL,
                                # segment = NULL,
                                max_results = 100,
                                query = NULL,
                                return_query_only = FALSE,
                                bucket = NULL,
                                download_file = NULL){
if (is.null(dimensions)) {
stop("At least one dimension, such as date, is required to group the metric(s) by.",
call. = FALSE)
}
projectId <- as.character(projectId)
datasetId <- as.character(datasetId)
start <- if(!is.null(start)) as.character(as.Date(start))
end <- if(!is.null(end)) as.character(as.Date(end))
max_results <- as.integer(max_results)
  ## validate against the standard lookups plus the custom dimension/metric names
  valid_m <- c(names(lookup_bq_query_m), names(customMetricMaker()))
  valid_d <- c(names(lookup_bq_query_d), names(customDimensionMaker()))

  if (!is.null(metrics) && !all(metrics %in% valid_m)) {
    stop(sprintf("Invalid metric name(s): %s. Metrics currently available: %s, plus metric1-metric200",
                 toString(setdiff(metrics, valid_m)),
                 toString(names(lookup_bq_query_m))),
         call. = FALSE)
  }

  if (!is.null(dimensions) && !all(dimensions %in% valid_d)) {
    stop(sprintf("Invalid dimension name(s): %s. Dimensions currently available: %s, plus dimension1-dimension200",
                 toString(setdiff(dimensions, valid_d)),
                 toString(names(lookup_bq_query_d))),
         call. = FALSE)
  }
check_packages_installed("bigQueryR")
  ## if Sys.Date() == end then also query the ga_sessions_intraday_ tables
if(is.null(query)){
if(as.Date(end) == Sys.Date()){
from_q <- sprintf("FROM (TABLE_DATE_RANGE([%s.ga_sessions_], TIMESTAMP('%s'), DATE_ADD(CURRENT_TIMESTAMP(), -1, 'DAY'))), (TABLE_DATE_RANGE([%s.ga_sessions_intraday_], DATE_ADD(CURRENT_TIMESTAMP(), -1, 'DAY'), CURRENT_TIMESTAMP()))",
datasetId, start, datasetId)
} else {
from_q <- sprintf("FROM (TABLE_DATE_RANGE([%s.ga_sessions_], TIMESTAMP('%s'), TIMESTAMP('%s')))",
datasetId, start, end)
}
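    ## For illustration, with datasetId "123456" and start "2016-01-01" the
    ## intraday branch above yields roughly:
    ## FROM (TABLE_DATE_RANGE([123456.ga_sessions_], TIMESTAMP('2016-01-01'), DATE_ADD(CURRENT_TIMESTAMP(), -1, 'DAY'))),
    ##      (TABLE_DATE_RANGE([123456.ga_sessions_intraday_], DATE_ADD(CURRENT_TIMESTAMP(), -1, 'DAY'), CURRENT_TIMESTAMP()))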
if(!is.null(dimensions)){
## add the 200 custom dims too
lookup_bq_query_d <- c(lookup_bq_query_d, customDimensionMaker())
      dims <- paste(lookup_bq_query_d[dimensions], collapse = ", ")
group_q <- paste("GROUP BY", paste(dimensions, collapse = ", "))
} else {
dims <- NULL
group_q <- NULL
}
    lookup_bq_query_m <- c(lookup_bq_query_m, customMetricMaker())
    mets <- if(!is.null(metrics)) paste(lookup_bq_query_m[metrics], collapse = ", ")

    select_q <- paste("SELECT", paste(c(dims, mets), collapse = ", "))
if(!is.null(sort)){
order_q <- paste("ORDER BY", paste(sort, collapse = ", "), "DESC")
} else {
order_q <- NULL
}
limit_q <- paste("LIMIT", as.character(max_results))
query <- paste(select_q, from_q, group_q, order_q, limit_q)
if(return_query_only){
return(query)
} else {
myMessage("Query: ", query, level = 3)
}
}
if(max_results < 100000){
out <- bigQueryR::bqr_query(projectId, datasetId, query)
} else {
## do an async query
if (!requireNamespace("googleCloudStorageR", quietly = TRUE)) {
stop("googleCloudStorageR needed for large extracts. Please install it via install.packages('googleCloudStorageR')",
call. = FALSE)
}
myMessage("Over 100,000 rows to fetch, creating asynchronous query via Google Cloud Storage.", level = 2)
google_analytics_bq_asynch(projectId = projectId,
datasetId = datasetId,
query = query,
bucket = bucket,
download_file = download_file)
return()
}
## convert to more R like objects if we can
if("hitTimestamp" %in% names(out)){
out$hitTimestamp <- as.POSIXct(out$hitTimestamp, origin = "1970-01-01")
}
if("date" %in% names(out)){
out$date <- as.Date(out$date, format = "%Y%m%d")
}
if("visitStartTime" %in% names(out)){
out$visitStartTime <- as.POSIXct(out$visitStartTime, origin = "1970-01-01")
}
out
}
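## For reference, a sketch of the legacy SQL the function above constructs for
## a simple call (project and dataset IDs below are placeholders):
##
## google_analytics_bq("my-project", "123456",
##                     start = "2016-01-01", end = "2016-02-01",
##                     metrics = "sessions", dimensions = "date",
##                     return_query_only = TRUE)
##
## returns approximately:
##
## SELECT date, SUM(totals.visits) as sessions
## FROM (TABLE_DATE_RANGE([123456.ga_sessions_], TIMESTAMP('2016-01-01'), TIMESTAMP('2016-02-01')))
## GROUP BY date LIMIT 100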
#' Asynchronous fetch
#' @keywords internal
google_analytics_bq_asynch <- function(projectId,
datasetId,
query,
bucket,
download_file){
if (!requireNamespace("bigQueryR", quietly = TRUE)) {
stop("bigQueryR needed for this function to work. Please install it via install.packages('bigQueryR')",
call. = FALSE)
}
if (!requireNamespace("googleCloudStorageR", quietly = TRUE)) {
stop("googleCloudStorageR needed for this function to work. Please install it via install.packages('googleCloudStorageR')",
call. = FALSE)
}
time0 <- Sys.time()
required_scopes <- c("https://www.googleapis.com/auth/devstorage.full_control",
"https://www.googleapis.com/auth/cloud-platform")
if(!any(getOption("googleAuthR.scopes.selected") %in% required_scopes)){
stop("Need re-authentication with googleAuthR::gar_auth() with one of scopes:",
paste(required_scopes, collapse = " "))
}
if(is.null(bucket)){
    ## maybe call googleCloudStorageR to create the bucket?
stop("Need a Google Cloud bucket to send data to. Please create one at https://cloud.google.com/storage/")
}
tableId <- paste0("googleAnalyticsRjob_",
gsub("-|:| ","",as.character(Sys.time())),
"_",
idempotency())
query_job <- bigQueryR::bqr_query_asynch(projectId = projectId,
datasetId = datasetId,
query = query,
destinationTableId = tableId)
query_job <- bigQueryR::bqr_wait_for_job(query_job)
myMessage("\nBigQuery query successful and now in BigQuery tableId: ", tableId,
"\n - now extracting data to Cloud Storage bucket", bucket, level = 3)
extract_job <- bigQueryR::bqr_extract_data(projectId = projectId,
datasetId = datasetId,
tableId = tableId,
cloudStorageBucket = bucket)
extract_job <- bigQueryR::bqr_wait_for_job(extract_job)
myMessage("\nBigQuery extract successful to ", bucket,
" - now downloading data from Google Cloud Storage", level = 3)
bigQueryR::bqr_download_extract(extract_job,
filename = download_file)
myMessage("All finished, total job time:", format(difftime(Sys.time(), time0), format = "%H:%M:%S"), level = 3)
}
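## A usage sketch of the large-extract flow above, assuming a pre-created
## Google Cloud Storage bucket named "my-bucket" (any max_results of 100,000
## or more routes the request through this asynchronous path):
##
## google_analytics_bq("my-project", "123456",
##                     start = "2016-01-01", end = "2016-12-31",
##                     metrics = "sessions", dimensions = c("date", "medium"),
##                     max_results = 1e6,
##                     bucket = "my-bucket",
##                     download_file = "big_extract.csv")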
## this is hit level: add session and product level too.
customDimensionMaker <- function(customDimensionIndex = paste0("dimension", 1:200)){
  assertthat::assert_that(is.character(customDimensionIndex))

  matched <- grep("^dimension([0-9]+)$", customDimensionIndex, value = TRUE)
  if(length(matched) < 1) stop("Custom dimension specified but no custom dimensions found")

  ## extract the numeric suffix, e.g. "dimension7" -> "7"
  index_numbers <- sub("^dimension", "", matched)

  dimensionXX <- "MAX(IF (hits.customDimensions.index = XX, hits.customDimensions.value, NULL)) WITHIN RECORD AS dimensionXX"

  out <- vapply(index_numbers, function(i) gsub("XX", i, dimensionXX), character(1))
  names(out) <- matched
  out
}
customMetricMaker <- function(customMetricIndex = paste0("metric", 1:200)){
  assertthat::assert_that(is.character(customMetricIndex))

  matched <- grep("^metric([0-9]+)$", customMetricIndex, value = TRUE)
  if(length(matched) < 1) stop("No custom metrics found")

  ## extract the numeric suffix, e.g. "metric5" -> "5"
  index_numbers <- sub("^metric", "", matched)

  metricXX <- "MAX(IF (hits.customMetrics.index = XX, hits.customMetrics.value, NULL)) WITHIN RECORD AS metricXX"

  out <- vapply(index_numbers, function(i) gsub("XX", i, metricXX), character(1))
  names(out) <- matched
  out
}
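## For illustration, both makers turn a custom index name into the legacy SQL
## snippet used in the SELECT clause, e.g. customDimensionMaker("dimension7")
## returns a named snippet:
## dimension7 = "MAX(IF (hits.customDimensions.index = 7, hits.customDimensions.value, NULL)) WITHIN RECORD AS dimension7"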
lookup_bq_query_m <- c(visits = "SUM(totals.visits) as sessions",
sessions = "SUM(totals.visits) as sessions",
pageviews = "SUM(totals.pageviews) as pageviews",
timeOnSite = "SUM(totals.timeOnSite) as timeOnSite",
bounces = "SUM(totals.bounces) as bounces",
bounceRate = "(SUM(totals.bounces)/SUM(totals.visits))*100 as bounceRate",
transactions = "SUM(totals.transactions) as transactions",
transactionRevenue = "SUM(totals.transactionRevenue)/1000000 as transactionRevenue",
transactionsPerSession = "(SUM(totals.transactions) / SUM(totals.visits)) as transactionsPerSession",
revenuePerTransaction = "(SUM(totals.transactionRevenue)/1000000) / SUM(totals.transactions) as revenuePerTransaction",
                       newSessions = "SUM(totals.newVisits) as newSessions",
percentNewSessions = "(SUM(totals.newVisits) / SUM(totals.visits))*100 AS percentNewSessions",
screenviews = "SUM(totals.screenviews) as screenviews",
uniqueScreenviews = "SUM(totals.uniqueScreenviews) as uniqueScreenviews",
timeOnScreen = "SUM(totals.timeOnScreen) as timeOnScreen",
                       users = "EXACT_COUNT_DISTINCT(fullVisitorId) as users",
exits = "COUNT(hits.isExit) as exits",
entrances = "COUNT(hits.isEntrance) as entrances",
eventValue = "SUM(hits.eventinfo.eventValue) as eventValue")
# lookup_bq_dimensions <- meta[meta$type == "DIMENSION","name"]
lookup_bq_query_d <- c(referralPath = "trafficSource.referralPath as referralPath",
hitTimestamp = "(visitStartTime + (hits.time/1000)) as hitTimestamp",
campaign = "trafficSource.campaign as campaign",
source = "trafficSource.source as source",
medium = "trafficSource.medium as medium",
keyword = "trafficSource.keyword as keyword",
adContent = "trafficSource.adContent as adContent",
                       adwordsCampaignID = "trafficSource.adwordsClickInfo.campaignId as adwordsCampaignID",
                       adwordsAdGroupID = "trafficSource.adwordsClickInfo.adGroupId as adwordsAdGroupID",
# adwords...etc...
transactionId = "hits.transaction.transactionId as transactionId",
date = "date",
fullVisitorId = "fullVisitorId",
userId = "userId",
visitorId = "visitorId",
visitId = "visitId",
visitStartTime = "visitStartTime",
visitNumber = "visitNumber",
browser = "device.browser as browser",
browserVersion = "device.browserVersion as browserVersion",
operatingSystem = "device.operatingSystem as operatingSystem",
operatingSystemVersion = "device.operatingSystemVersion as operatingSystemVersion",
mobileDeviceBranding = "device.mobileDeviceBranding as mobileDeviceBranding",
flashVersion = "device.flashVersion as flashVersion",
language = "device.language as language",
screenColors = "device.screenColors as screenColors",
screenResolution = "device.screenResolution as screenResolution",
deviceCategory = "device.deviceCategory as deviceCategory",
continent = "geoNetwork.continent as continent",
subContinent = "geoNetwork.subContinent as subContinent",
country = "geoNetwork.country as country",
region = "geoNetwork.region as region",
                       metro = "geoNetwork.metro as metro",
pagePath = "hits.page.pagePath as pagePath",
sourcePropertyDisplayName = "hits.sourcePropertyInfo.sourcePropertyDisplayName as sourcePropertyDisplayName",
eventCategory = "hits.eventInfo.eventCategory as eventCategory",
eventAction = "hits.eventInfo.eventAction as eventAction",
eventLabel = "hits.eventInfo.eventLabel as eventLabel",
## from http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/
landingPagePath = "FIRST(IF(hits.type = 'PAGE', hits.page.pagePath, NULL)) WITHIN RECORD AS landingPagePath")
## Example queries to add
##
## Unique pageviews:
# SELECT COUNT(1) as unique_pageviews
# FROM (
#   SELECT
#     hits.page.pagePath,
#     hits.page.pageTitle,
#     fullVisitorId,
#     visitNumber,
#     COUNT(1) as hits
#   FROM [my_table]
#   WHERE hits.type='PAGE'
#   GROUP BY
#     hits.page.pagePath,
#     hits.page.pageTitle,
#     fullVisitorId,
#     visitNumber
# )
## custom dimensions
# SELECT
# fullvisitorid,
# concat(string(visitid), fullvisitorid) as sessionid,
# hits.hitnumber as hitnumber,
# max(case when hits.customdimensions.index = 7 then hits.customdimensions.value end) Author,
# max(case when hits.customdimensions.index = 8 then hits.customdimensions.value end) Category,
# max(case when hits.customdimensions.index = 9 then hits.customdimensions.value end) ISBN,
# max(case when hits.customdimensions.index = 10 then hits.customdimensions.value end) Action
# FROM 123456.ga_sessions_YYYYMMDD
# GROUP EACH BY fullvisitorid,sessionid, hitnumber
## or this?
# custom dimension at the hit level
# SELECT fullVisitorId, visitId, hits.hitNumber, hits.time,
# MAX(IF(hits.customDimensions.index=1,
# hits.customDimensions.value,
# NULL)) WITHIN hits AS customDimension1,
# FROM [tableID.ga_sessions_20150305]
# LIMIT 100
#
#
# custom dimension at the session level
# SELECT fullVisitorId, visitId,
# MAX(IF(customDimensions.index=2,
# customDimensions.value,
# NULL)) WITHIN RECORD AS customDimension2,
# FROM [tableID.ga_sessions_20150305]
# LIMIT 100