#' Google Flights
#'
#' Google Flights is an online flight booking service that lets you organize a
#' search by number of stopovers. This function saves Google Flights result
#' pages as html files so that the information can be extracted later.
#'
#' @param type It must be one of two options: 'collect' or 'scrap'.
#'   'collect' downloads the result pages and stores them under
#'   `<base_dir>/DATA`; 'scrap' is meant to scrape the information from the
#'   stored html files (not implemented yet). Any other value only prints a
#'   usage message. `type` is ignored on the very first run, when the storage
#'   folder is still being created.
#' @param base_dir Folder used for storage. Defaults to
#'   `~/WEBDATA/GOOGLEFLIGHTS`.
#'
#' @return `NULL`, invisibly. The function is called for its side effects.
#'
#' @section storage:
#'
#' The data is stored in the folder `~/WEBDATA/GOOGLEFLIGHTS/` (or `base_dir`).
#' Pages are saved as `DATA/<collection date>/<id>/<flight date>.html` and the
#' information inside them is meant to be extracted afterwards. The working
#' directory of the R session is never changed.
#'
#' @section args:
#'
#' The first call creates the storage folders and an empty `args.csv` inside
#' `base_dir`. The file is written and read with `write.csv2()` /
#' `read.csv2()` (semicolon separated) and has the columns:
#' \describe{
#'   \item{id}{Identifier of the search; used as the sub-folder name.}
#'   \item{de, para}{Not read by this function; presumably the origin and the
#'     destination of the search.}
#'   \item{di, df}{First and last date to collect (anything `as.Date()`
#'     understands, e.g. `2019-10-06`). Rows whose `df` is missing or already
#'     in the past are ignored.}
#'   \item{url}{A search url containing a date written as `YYYY-MM-DD`; every
#'     such date is replaced by each day between `di` and `df`.}
#' }
#'
#' 'collect' needs a Selenium server with Firefox listening on
#' `localhost:4445`.
#'
#' @export
googleflights <- function(type, base_dir = "~/WEBDATA/GOOGLEFLIGHTS") {
  # Work with absolute paths instead of setwd(): the caller's working
  # directory must not be altered as a side effect.
  base_dir <- path.expand(base_dir)
  data_dir <- file.path(base_dir, "DATA")
  args_file <- file.path(base_dir, "args.csv")

  if (!dir.exists(base_dir)) {
    ## This step runs only once: create the folders and an empty args.csv
    dir.create(data_dir, recursive = TRUE)
    template <- data.frame(
      id = character(0),
      de = character(0),
      para = character(0),
      di = character(0),
      df = character(0),
      url = character(0),
      stringsAsFactors = FALSE
    )
    utils::write.csv2(template, row.names = FALSE, file = args_file)
    cat(
      "\nIt is the first time you run this function.",
      "\nPlease fill the file args.csv in", base_dir,
      "following the instructions in the args section.\n"
    )
    return(invisible(NULL))
  }

  # Anything that is not a single non-missing string falls through to the
  # usage message instead of failing inside `if ()`.
  valid_type <- is.character(type) && length(type) == 1L && !is.na(type)

  if (valid_type && type == "collect") {
    if (!file.exists(args_file)) {
      stop("File not found: ", args_file, call. = FALSE)
    }
    voo <- utils::read.csv2(args_file, stringsAsFactors = FALSE)
    voo$df <- as.Date(voo$df)
    # Keep only searches that have not expired (rows with a missing `df` are
    # dropped as well).
    voo <- voo[!is.na(voo$df) & voo$df >= Sys.Date(), , drop = FALSE]
    ## Verify if the file had more than 0 rows
    if (nrow(voo) == 0) {
      stop("Please complete the initial file.", call. = FALSE)
    }

    # One folder per collection day; silently reused on a second run.
    day_dir <- file.path(data_dir, as.character(Sys.Date()))
    dir.create(day_dir, showWarnings = FALSE, recursive = TRUE)

    # The browser is opened only after the input was validated, and it is
    # closed even when a later step fails.
    remDr <- RSelenium::remoteDriver(
      remoteServerAddr = "localhost",
      port = 4445L,
      browserName = "firefox"
    )
    remDr$open()
    on.exit(remDr$close(), add = TRUE)

    date_pattern <- "[0-9]{4}-[0-9]{2}-[0-9]{2}"
    for (i in seq_len(nrow(voo))) {
      first_day <- as.Date(voo$di[i])
      last_day <- voo$df[i]
      if (is.na(first_day) || first_day > last_day) {
        warning(
          "Skipping id '", voo$id[i], "': 'di' is missing or later than 'df'.",
          call. = FALSE
        )
        next
      }

      ## Create the dir to ID
      id_dir <- file.path(day_dir, as.character(voo$id[i]))
      dir.create(id_dir, showWarnings = FALSE, recursive = TRUE)

      days <- format(seq.Date(first_day, last_day, by = 1))
      # Put each day in place of every date found in the url template.
      urls <- vapply(
        days,
        function(day) gsub(date_pattern, day, voo$url[i]),
        character(1),
        USE.NAMES = FALSE
      )

      for (w in seq_along(urls)) {
        remDr$navigate(urls[w])
        Sys.sleep(2) # give the page time to render before reading it
        h <- XML::htmlParse(remDr$getPageSource()[[1]], asText = TRUE)
        XML::saveXML(h, file = file.path(id_dir, paste0(days[w], ".html")))
      }
    }
  } else if (valid_type && type == "scrap") {
    # TODO
    # h <- htmlParse("2019_07_07/2019_10_06_1.html")
    #
    # xpath <- "gws-flights-results__result-item gws-flights__flex-box gws-flights-results__collapsed"
    # h.sub <- getNodeSet(h, paste0("//li[@class='", xpath, "']"))
    #
    # # Loop
    # v <- xmlDoc(h.sub[[1]])
    #
    # # Price ------------------------------------------------------
    # xpath <- "//div[@class='gws-flights-results__itinerary-price']"
    # preco <- xpathApply(v, xpath, xmlValue)
    #
    # # From -------------------------------------------------------
    # xpath <- "//div[@class='gws-flights-results__leg-arrival gws-flights__flex-box flt-subhead1Normal']"
    # de <- xpathApply(v, xpath, xmlValue)
    #
    # # To ---------------------------------------------------------
    # xpath <- "//div[@class='gws-flights-results__leg-departure gws-flights__flex-box flt-subhead1Normal']"
    # para <- xpathApply(v, xpath, xmlValue)
    #
    # # TME --------------------------------------------------------
    # xpath <- "//div[@class='gws-flights-results__leg-duration gws-flights__flex-box flt-body2']"
    # tme <- xpathApply()
  } else {
    cat("\nProvide one of the two option to the arg type: 'collect' or 'scrap'.",
        "\n\tSee: ?WEBDATA::googleflights()\n")
  }

  invisible(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.