R/core.R

rga$methods(
    list(
        getData = function(ids, start.date = format(Sys.Date() - 8, "%Y-%m-%d"),
                           end.date = format(Sys.Date() - 1, "%Y-%m-%d"), date.format = "%Y-%m-%d",
                           metrics = "ga:users,ga:sessions,ga:pageviews", dimensions = "ga:date",
                           sort = "", filters = "", segment = "", fields = "",
                           start = 1, max, messages = TRUE,  batch, walk = FALSE,
                           output.raw, output.formats, return.url = FALSE, rbr = FALSE, envir = .GlobalEnv,
                           samplingLevel = "HIGHER_PRECISION") {

            if (missing(ids)) {
                stop("please enter a profile id")
            }

            if (missing(batch) || batch == FALSE) {
                isBatch <- FALSE
                if (missing(max)) {
                    # standard
                    max <- 1000
                }
            } else {
                isBatch <- TRUE
                if (!is.numeric(batch)) {
                    if (!missing(max) && max < 10000) {
                        # no need
                        batch <- max
                    } else {
                        # max batch size
                        batch <- 10000
                    }
                } else {
                    if (batch > 10000) {
                        # as per https://developers.google.com/analytics/devguides/reporting/core/v3/reference#maxResults
                        stop("batch size can be set to max of 10000")
                    }
                }

                adjustMax <- TRUE
                # arbitrary target, adjust later
                max <- 10000
            }

            # ensure that profile id begings with 'ga:'
            if (!grepl("ga:", ids)) {
                ids <- paste("ga:", ids, sep = "")
            }

            # remove whitespaces from metrics and dimensions
            metrics <- gsub("\\s", "", metrics)
            dimensions <- gsub("\\s", "", dimensions)

            # build url with variables
            url <- "https://www.googleapis.com/analytics/v3/data/ga"
            query <- paste(paste("access_token", .self$getToken()$access_token, sep = "="),
                           paste("ids", ids, sep = "="),
                           paste("start-date", start.date, sep = "="),
                           paste("end-date", end.date, sep = "="),
                           paste("metrics", metrics, sep = "="),
                           paste("dimensions", dimensions, sep = "="),
                           paste("start-index", start, sep = "="),
                           paste("max-results", max, sep = "="),
                           paste("samplingLevel", samplingLevel, sep = "="),
                           sep = "&")

            if (sort != "") {
                query <- paste(query, paste("sort", sort, sep = "="), sep = "&")
            }
            if (segment != "") {
                query <- paste(query, paste("segment", segment, sep = "="), sep = "&")
            }
            if (fields != "") {
                query <- paste(query, paste("fields", fields, sep = "="), sep = "&")
            }
            if (filters != "") {
                # available operators
                ops <- c("==", "!=", ">", "<", ">=", "<=", "=@", "!@", "=-", "!-", "\\|\\|", "&&", "OR", "AND")
                # make pattern for gsub
                opsw <- paste("(\\ )+(", paste(ops, collapse = "|"), ")(\\ )+", sep = "")
                # remove whitespaces around operators
                filters <- gsub(opsw, "\\2", filters)
                # replace logical operators
                filters <- gsub("OR|\\|\\|", ",", filters)
                filters <- gsub("AND|&&", ";", filters)
                query <- paste(query, paste("filters", curlEscape(filters), sep = "="), sep = "&", collapse = "")
            }

            url <- paste(url, query = query, sep = "?")

            if (return.url) {
                return(url)
            }

            # thanks to Schaun Wheeler this will not provoke the weird SSL-bug
            if (.Platform$OS.type == "windows") {
                options(RCurlOptions = list(
                    verbose = FALSE,
                    capath = system.file("CurlSSL", "cacert.pem",
                    package = "RCurl"), ssl.verifypeer = FALSE))
            }

            # get data and convert from json to list-format
            # switched to use httr and jsonlite
            request <- GET(url)
            ga.data <- jsonlite::fromJSON(content(request, "text"))

            # possibility to extract the raw data
            if (!missing(output.raw)) {
                assign(output.raw, ga.data, envir = envir)
            }

            # output error and stop
            if (!is.null(ga.data$error)) {
                stop(paste("error in fetching data: ", ga.data$error$message, sep = ""))
            }

            if (ga.data$containsSampledData == "TRUE") {
                isSampled <- TRUE
                if (!walk) {
                    message(sprintf("Notice: Data set sampled from %s sessions (%d%% of all sessions)",
                                    format(as.numeric(ga.data$sampleSize), big.mark=",", scientific=FALSE),
                                    round((as.numeric(ga.data$sampleSize) / as.numeric(ga.data$sampleSpace) * 100))))
                }
            } else {
                isSampled <- FALSE
            }

            if (isSampled && walk) {
                return(.self$getDataInWalks(total = ga.data$totalResults, max = max, batch = batch,
                                            ids = ids, start.date = start.date, end.date = end.date, date.format = date.format,
                                            metrics = metrics, dimensions = dimensions, sort = sort, filters = filters,
                                            segment = segment, fields = fields, envir = envir, samplingLevel = samplingLevel))
            }

            # check if all data is being extracted
            if (NROW(ga.data$rows) < ga.data$totalResults && (messages || isBatch)) {
                if (!isBatch) {
                    message(paste("Only pulling", NROW(ga.data$rows), "observations of", ga.data$totalResults, "total (set batch = TRUE to get all observations)"))
                } else {
                    if (adjustMax) {
                        max <- ga.data$totalResults
                    }
                    message(paste("Batch: pulling", max, "observations in batches of", batch))
                    # pass variables to batch-function
                    return(.self$getDataInBatches(total = ga.data$totalResults, max = max, batchSize = batch,
                                                  ids = ids, start.date = start.date, end.date = end.date, date.format = date.format,
                                                  metrics = metrics, dimensions = dimensions, sort = sort, filters = filters,
                                                  segment = segment, fields = fields, envir = envir, samplingLevel = samplingLevel))
                }
            }

            # get column names
            ga.headers <- ga.data$columnHeaders
            # remove ga: from column headers
            ga.headers$name <- sub("ga:", "", ga.headers$name)

            # did not return any results
            if (!inherits(ga.data$rows, "matrix") && !rbr) {
                stop(paste("no results:", ga.data$totalResults))
            } else if (!inherits(ga.data$rows, "matrix") && rbr) {
              # If row-by-row is true, return NULL
              return(NULL)
            }

            # convert to data.frame
            ga.data.df <- as.data.frame(ga.data$rows, stringsAsFactors = FALSE)
            # insert column names
            names(ga.data.df) <- ga.headers$name

            # check if sampled; add attributes if so
            if (isSampled) {
                attr(ga.data.df, "containsSampledData") <- TRUE
                attr(ga.data.df, "sampleSize") <- as.numeric(ga.data$sampleSize)
                attr(ga.data.df, "sampleSpace") <- as.numeric(ga.data$sampleSpace)
            } else {
                attr(ga.data.df, "containsSampledData") <- FALSE
            }

            # find formats
            formats <- ga.headers

            # convert to r friendly
            formats$dataType[formats$dataType %in% c("INTEGER", "PERCENT", "TIME", "CURRENCY", "FLOAT")] <- "numeric"
            formats$dataType[formats$dataType == "STRING"] <- "character"
            # addition rules
            formats$dataType[formats$name %in% c("latitude", "longitude")] <- "numeric"
            formats$dataType[formats$name %in% c("year", "month", "week", "day", "hour", "minute", "nthMonth", "nthWeek", "nthDay", "nthHour", "nthMinute", "dayOfWeek", "sessionDurationBucket", "visitLength", "daysSinceLastVisit", "daysSinceLastSession", "visitCount", "sessionCount", "sessionsToTransaction", "daysToTransaction")] <- "ordered"
            formats$dataType[formats$name == "date"] <- "Date"

            if ("date" %in% ga.headers$name) {
                ga.data.df$date <- format(as.Date(ga.data.df$date, "%Y%m%d"), date.format)
            }

            # looping through columns and setting classes
            for (i in 1:nrow(formats)) {
                column <- formats$name[i]
                class <- formats$dataType[[i]]
                if (!exists(paste("as.", class, sep = ""), mode = "function")) {
                    stop(paste("can't find function for class", class))
                } else {
                    as.fun <- match.fun(paste("as.", class, sep = ""))
                }
                if (class == "ordered") {
                    ga.data.df[[column]] <- as.numeric(ga.data.df[[column]])
                }
                ga.data.df[[column]] <- as.fun(ga.data.df[[column]])
            }

            # mos-def optimize
            if (!missing(output.formats)) {
                assign(output.formats, formats, envir = envir)
            }

            # and we're done
            return(ga.data.df)
        },
        getFirstDate = function(ids) {
            first <- .self$getData(ids, start.date = "2005-01-01", filters = "ga:hits!=0", max = 1, messages = FALSE)
            return(first$date)
        },
        getDataInBatches = function(batchSize, total, ids, start.date, end.date, date.format,
                                    metrics, max, dimensions, sort, filters, segment, fields, envir,
                                    samplingLevel) {
            runs.max <- ceiling(max/batchSize)
            chunk.list <- vector("list", runs.max)
            for (i in 0:(runs.max - 1)) {
                start <- i * batchSize + 1
                end <- start + batchSize - 1
                # adjust batch size if we're pulling the last batch
                if (end > max) {
                    batchSize <- max - batchSize
                    end <- max
                }

                message(paste("Batch: run (", i + 1, "/", runs.max, "), observations [", start, ":", end, "]. Batch size: ", batchSize, sep = ""))
                chunk <- .self$getData(ids = ids, start.date = start.date, end.date = end.date, metrics = metrics, dimensions = dimensions, sort = sort,
                                       filters = filters, segment = segment, fields = fields, date.format = date.format, envir = envir, messages = FALSE, return.url = FALSE,
                                       batch = FALSE, start = start, max = batchSize, samplingLevel = samplingLevel)
                message(paste("Batch: received", NROW(chunk), "observations"))
                chunk.list[[i + 1]] <- chunk
            }
            return(do.call(rbind, chunk.list, envir = envir))
        },
        getDataInWalks = function(total, max, batch, ids, start.date, end.date, date.format,
                                  metrics, dimensions, sort, filters, segment, fields, envir,
                                  samplingLevel) {
            # this function will extract data day-by-day (to avoid sampling)
            walks.max <- ceiling(as.numeric(difftime(as.Date(end.date), as.Date(start.date), units = "days")))
            chunk.list <- vector("list", walks.max + 1)

            for (i in 0:(walks.max)) {
                date <- format(as.Date(start.date) + i, "%Y-%m-%d")

                message(paste("Walk: run (", i + 1, "/", walks.max + 1, ") for date ", date, sep = ""))
                chunk <- .self$getData(ids = ids, start.date = date, end.date = date, date.format = date.format,
                                       metrics = metrics, dimensions = dimensions, sort = sort, filters = filters,
                                       segment = segment, fields = fields, envir = envir, max = max,
                                       rbr = TRUE, messages = FALSE, return.url = FALSE, batch = batch, samplingLevel = samplingLevel)
                message(paste("Walk: received", NROW(chunk), "observations"))
                chunk.list[[i + 1]] <- chunk
            }

            return(do.call(rbind, chunk.list, envir = envir))
        }
    )
)
skardhamar/rga documentation built on May 30, 2019, 1 a.m.