# tests/googleflights.R

#' Google Flights
#'
#' Google Flights is an online flight booking service that lets you organize a
#' search by number of stopovers. This function drives a browser through a
#' Selenium server to download result pages and stores them as html files.
#'
#' @param type It must be one of two options: 'collect' or 'scrap'.
#'     'collect' downloads the pages and stores them in
#'     ~/WEBDATA/GOOGLEFLIGHTS/DATA; 'scrap' is meant to scrape the
#'     information from the stored html files (not implemented yet).
#'
#' @return Called for its side effects; returns `NULL` invisibly.
#'
#' @section storage:
#'
#' The data is stored in the following folder: ~/WEBDATA/GOOGLEFLIGHTS/. Each
#' 'collect' run saves the pages as
#' ~/WEBDATA/GOOGLEFLIGHTS/DATA/<run date>/<id>/<flight date>.html; the
#' information inside the html pages is meant to be extracted afterwards.
#'
#' @section args:
#'
#' The first call creates the folder structure and an empty
#' ~/WEBDATA/GOOGLEFLIGHTS/args.csv (written with `write.csv2`, read back with
#' `read.csv2`). One row describes one search:
#' \itemize{
#'   \item `id`: name of the sub-folder where the pages of this search go.
#'   \item `de`, `para`: not used by the code (presumably origin and
#'     destination).
#'   \item `di`, `df`: first and last day of the search, in a format accepted
#'     by `as.Date()`. Rows whose `df` is before today are ignored.
#'   \item `url`: search URL containing a date written as YYYY-MM-DD; every
#'     date in the URL is replaced by each day between `di` and `df`.
#' }
#'
#' 'collect' expects a Selenium server with Firefox listening on
#' localhost:4445.
#'
#' @export

googleflights <- function(type) {

    if (dir.exists("~/WEBDATA/GOOGLEFLIGHTS/")) {
        ## Relative paths below are resolved against DATA. Put the caller's
        ## working directory back when the function exits, even on error.
        old_wd <- setwd("~/WEBDATA/GOOGLEFLIGHTS/DATA")
        on.exit(setwd(old_wd), add = TRUE)

        if (type == "collect") {
            ## Read and validate the search list before opening the browser,
            ## so a bad args.csv does not leave a Selenium session behind.
            voo <- read.csv2("../args.csv", stringsAsFactors = FALSE)
            voo$df <- as.Date(voo$df)
            ## Keep only searches that have not ended yet (NA dates dropped)
            voo <- voo[!is.na(voo$df) & voo$df >= Sys.Date(), , drop = FALSE]
            if (nrow(voo) == 0) stop("Please complete the initial file.")

            ## One folder per run date; it may already exist when the
            ## function is run more than once on the same day.
            run_dir <- as.character(Sys.Date())
            dir.create(run_dir, showWarnings = FALSE)
            setwd(run_dir)

            remDr <- RSelenium::remoteDriver(
                remoteServerAddr = "localhost",
                port = 4445L,
                browserName = "firefox"
            )
            remDr$open()
            ## Close the browser session even if a page fails to load
            on.exit(remDr$close(), add = TRUE)

            date_pattern <- "[0-9]{4}-[0-9]{2}-[0-9]{2}"

            for (i in seq_len(nrow(voo))) {
                ## Create the dir to ID
                id_dir <- as.character(voo$id[i])
                if (!dir.exists(id_dir)) {
                    dir.create(id_dir)
                }

                ## One URL per day in [di, df]: the date embedded in the
                ## template URL is swapped for each day of the range.
                to <- seq.Date(as.Date(voo$di[i]), voo$df[i], by = 1)
                url <- vapply(
                    as.character(to),
                    function(x) gsub(date_pattern, x, voo$url[i]),
                    character(1),
                    USE.NAMES = FALSE
                )

                for (w in seq_along(url)) {
                    remDr$navigate(url[w])
                    ## Fixed pause before reading the page source
                    Sys.sleep(2)
                    h <- XML::htmlParse(remDr$getPageSource()[[1]])
                    file <- paste0(id_dir, "/", to[w], ".html")
                    XML::saveXML(h, file = file)
                }
            }

        } else if (type == "scrap") {
            # TODO
            # h <- htmlParse("2019_07_07/2019_10_06_1.html")
            #
            # xpath <- "gws-flights-results__result-item gws-flights__flex-box gws-flights-results__collapsed"
            # h.sub <- getNodeSet(h, paste0("//li[@class='", xpath, "']"))
            #
            # # Loop
            # v <- xmlDoc(h.sub[[1]])
            #
            # # Price ------------------------------------------------------
            # xpath <- "//div[@class='gws-flights-results__itinerary-price']"
            # preco <- xpathApply(v, xpath, xmlValue)
            #
            # # From -------------------------------------------------------
            # xpath <- "//div[@class='gws-flights-results__leg-arrival gws-flights__flex-box flt-subhead1Normal']"
            # de <- xpathApply(v, xpath, xmlValue)
            #
            # # To ---------------------------------------------------------
            # xpath <- "//div[@class='gws-flights-results__leg-departure gws-flights__flex-box flt-subhead1Normal']"
            # para <- xpathApply(v, xpath, xmlValue)
            #
            # # TME --------------------------------------------------------
            # xpath <- "//div[@class='gws-flights-results__leg-duration gws-flights__flex-box flt-body2']"
            # tme <- xpathApply()

        } else {
            cat("\nProvide one of the two option to the arg type: 'collect' or 'scrap'.",
                "\n\tSee: ?WEBDATA::googleflights()\n")
        }
    } else {
        ## This step runs only once: create ~/WEBDATA, GOOGLEFLIGHTS and DATA
        ## in one go, without touching the caller's working directory.
        dir.create("~/WEBDATA/GOOGLEFLIGHTS/DATA", recursive = TRUE)

        ## Empty template with the columns 'collect' reads from args.csv
        url <- data.frame(matrix(ncol = 6, nrow = 0), stringsAsFactors = FALSE)
        colnames(url) <- c("id", "de", "para", "di", "df", "url")
        write.csv2(url, row.names = FALSE,
                   file = "~/WEBDATA/GOOGLEFLIGHTS/args.csv")
        cat("\nIt is the first time you run this function.",
            "\nPlease fill the file args.csv in ~/WEBDATA/GOOGLEFLIGHTS ", 
            "following the instructions in the args section.\n")
    }

    invisible(NULL)
}
# Andryas/WEBDATA documentation built on Jan. 2, 2020, 1:31 p.m.