#' Parse categories in product detail page
#'
#' Returns all categories (titles) in the detailed product
#' description page. The categories are returned in the order
#' in which they appear on the page; note that the categories are
#' not necessarily identical on the detailed product description
#' pages of different products within the same category.
#'
#' @param detailpagehtml html structure from a single geizhals page
#' listing details of a specific item.
#'
#' @return A character vector with the category names listed
#' in the specific geizhals page.
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## get url of a single detail page and read html:
#' url_detailpage <- dat_listpage[["detailpage_url"]][1]
#' detailpagehtml <- xml2::read_html(url_detailpage)
#' ## get categories:
#' parse_detailpage_categories(detailpagehtml)
#' }
#'
#' @export
parse_detailpage_categories <- function(detailpagehtml) {
  ret <- detailpagehtml %>%
    rvest::html_nodes(css = "#productdata") %>%
    rvest::html_nodes(css = ".gh-data-table__key") %>%
    rvest::html_text()
  return(ret)
}
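## A minimal offline sketch (not one of the package functions): the CSS
## selectors above assume a page structure roughly like the hypothetical
## snippet below, with category names in ".gh-data-table__key" cells
## inside "#productdata".
demo_detailpagehtml <- xml2::read_html(paste0(
  '<div id="productdata"><table class="gh-data-table">',
  '<tr><td class="gh-data-table__key">Hersteller</td>',
  '<td class="gh-data-table__value">ACME</td></tr>',
  '</table></div>'))
parse_detailpage_categories(demo_detailpagehtml)  ## expected: "Hersteller"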
#' Parse categories and their values in product detail page
#'
#' Returns all categories (titles) and their values
#' in the detailed product description page. The categories
#' are returned in the order in which they appear on the page;
#' note that the categories are not necessarily identical on the
#' detailed product description pages of different products within
#' the same category.
#'
#' @inheritParams parse_detailpage_categories
#'
#' @return A tibble (data.frame) with two columns (key and value),
#' containing the categories and their values.
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## get url of a single detail page and read html:
#' url_detailpage <- dat_listpage[["detailpage_url"]][1]
#' detailpagehtml <- xml2::read_html(url_detailpage)
#' ## get categories and their values:
#' parse_keyval_tbl(detailpagehtml)
#' }
#'
#' @export
parse_keyval_tbl <- function(detailpagehtml) {
  ## if there is html data (a failed fetch is stored as a single NA)...
  if (!(length(detailpagehtml) == 1 && is.na(detailpagehtml))) {
    ## ...get keys (categories):
    keys <- parse_detailpage_categories(detailpagehtml)
    ## get values:
    vals <- detailpagehtml %>%
      rvest::html_nodes(css = "#productdata") %>%
      rvest::html_nodes(css = ".gh-data-table__value") %>%
      rvest::html_text()
    ## make data.frame:
    ret <- tibble::tibble(key = keys, value = vals)
    ## remove duplicates:
    ret <- ret[!duplicated(ret), ]
  } else {
    ## if there is no data, just return the same structure with NA's:
    ret <- tibble::tibble(key = NA, value = NA)
  }
  return(ret)
}
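## Quick sketch of the NA fallback (assuming a failed fetch was stored as a
## single NA, as done by fetch_all_detailpage_html below): the same one-row
## key/value structure comes back, so downstream bind_rows()/joins keep working.
parse_keyval_tbl(NA)  ## tibble with key = NA, value = NA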
#' Parse price list in product detail page
#'
#' Returns all price values from the price list
#' in the detailed product description page.
#'
#' @inheritParams parse_detailpage_categories
#'
#' @return A numeric vector containing the prices.
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## get url of a single detail page and read html:
#' url_detailpage <- dat_listpage[["detailpage_url"]][1]
#' detailpagehtml <- xml2::read_html(url_detailpage)
#' ## get prices:
#' parse_prices(detailpagehtml)
#' }
#'
#' @export
parse_prices <- function(detailpagehtml) {
  ## get prices:
  ret <- detailpagehtml %>%
    rvest::html_nodes(css = ".offer__price") %>% ## get price cells
    rvest::html_text()
  ## remove first entry (table header):
  ret <- ret[-1]
  ## convert to numeric:
  ret <- ret %>%
    ## drop any trailing text after the last digit or comma:
    stringr::str_extract("^.*[0-9,]{1,}") %>%
    ## extract the first number of the form "digits,digits":
    stringr::str_extract("[0-9]{1,},[0-9]{0,}") %>%
    ## keep numerical parts only:
    stringr::str_replace_all("[^0-9,]", "") %>%
    ## decimal comma to decimal point:
    stringr::str_replace_all(",", "\\.") %>%
    as.numeric()
  return(ret)
}
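## Offline sketch of the string cleaning above, applied to hypothetical raw
## price strings (the exact text inside ".offer__price" cells may differ):
demo_raw_prices <- c("\u20ac 123,45", "ab \u20ac 99,90")
demo_raw_prices %>%
  stringr::str_extract("^.*[0-9,]{1,}") %>%
  stringr::str_extract("[0-9]{1,},[0-9]{0,}") %>%
  stringr::str_replace_all("[^0-9,]", "") %>%
  stringr::str_replace_all(",", "\\.") %>%
  as.numeric()
## expected: 123.45 99.90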
#' Calculate a summary of prices in product detail page
#'
#' Returns a summary of all price values from the price list
#' in the detailed product description page. Currently,
#' this summary contains the 3 lowest prices (or \code{NA} if
#' there aren't enough prices on that page), and the median
#' of all prices.
#'
#' @inheritParams parse_detailpage_categories
#'
#' @return A tibble (data.frame) with two columns (key and value),
#' containing the price summary results (key being a descriptive
#' key like \code{price_min}, value being the respective summary
#' measure of the prices). The value column is numeric; it is
#' converted to \code{character} by \code{parse_single_detailpage}
#' before being row-bound to the categories and their values.
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## get url of a single detail page and read html:
#' url_detailpage <- dat_listpage[["detailpage_url"]][1]
#' detailpagehtml <- xml2::read_html(url_detailpage)
#' ## get prices summary:
#' calc_price_summary(detailpagehtml)
#' }
#'
#' @export
calc_price_summary <- function(detailpagehtml) {
  ## if there is html data (a failed fetch is stored as a single NA)...
  if (!(length(detailpagehtml) == 1 && is.na(detailpagehtml))) {
    prices <- parse_prices(detailpagehtml = detailpagehtml)
    ## sort (just in case):
    prices <- sort(prices)
    ## return summary (indexing the sorted vector yields NA if there
    ## are fewer than three prices on the page):
    ret_val <- c(prices[1], prices[2], prices[3], stats::median(prices))
    ret_key <- c("price_min", "price_2nd_min", "price_3rd_min", "price_median")
    ret <- tibble::tibble(
      key = ret_key,
      value = ret_val
    )
  } else {
    ## if there is no html data, just return NA's in a similar structure:
    ret_val <- rep(NA_real_, 4)
    ret_key <- c("price_min", "price_2nd_min", "price_3rd_min", "price_median")
    ret <- tibble::tibble(
      key = ret_key,
      value = ret_val
    )
  }
  return(ret)
}
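## Sketch of the summary logic above on a hypothetical, already sorted price
## vector: with fewer than three offers, the 2nd/3rd lowest are simply NA.
demo_prices <- c(99.90, 123.45)
c(demo_prices[1], demo_prices[2], demo_prices[3], stats::median(demo_prices))
## expected: 99.90 123.45 NA 111.675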
#' Parse data from product detail page
#'
#' Returns all categories (titles) and their values
#' in the detailed product description page, as well as
#' a summary of all price values from the price list
#' in the detailed product description page.
#'
#' @inheritParams parse_detailpage_categories
#'
#' @return A tibble (data.frame) with two columns (key and value),
#' containing the categories and their values, as well as the
#' price summary results (key being a descriptive
#' key like \code{price_min}, value being the respective summary
#' measure of the prices). The value column is of type
#' \code{character}.
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## get url of a single detail page and read html:
#' url_detailpage <- dat_listpage[["detailpage_url"]][1]
#' detailpagehtml <- xml2::read_html(url_detailpage)
#' ## get data from detailpage:
#' parse_single_detailpage(detailpagehtml)
#' }
#'
#' @export
parse_single_detailpage <- function(detailpagehtml) {
  ## get data:
  ret_keyval <- parse_keyval_tbl(detailpagehtml)
  ret_price_summary <- calc_price_summary(detailpagehtml)
  ## modify data types (all character, currently):
  ret_price_summary[["value"]] <- as.character(ret_price_summary[["value"]])
  ret <- dplyr::bind_rows(
    ret_keyval,
    ret_price_summary
  )
  return(ret)
}
#' Fetch html of detailpage urls
#'
#' Retrieve the html code for a vector of detailpage urls, returning
#' the urls as well as the html code.
#'
#' @param detailpageurls A character vector containing urls to
#' sub-pages with detailed product descriptions (as found when following
#' a link in the listing page).
#' @param max_items A numeric (integer) vector of length one, specifying
#' the maximum number of items to scrape. (Default: \code{Inf}).
#' If \code{max_items} is smaller than the length of the passed urls
#' in \code{detailpageurls}, only the first \code{max_items} entries
#' are fetched.
#' @param delay_detailpage Number of seconds to wait after fetching the
#' html of each detailpage (defaults to \code{NA}, which means no delay).
#'
#' @return A list of length two. The first element, \code{url}, contains
#' the vector of urls that was actually fetched (the passed urls, possibly
#' truncated to the first \code{max_items} entries). The second list
#' element, \code{html}, contains another list with one entry per url,
#' containing the html.
#'
#' @examples
#' \dontrun{
#' ## first, get data from all listing pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpages <- parse_all_listpages(listpagehtml_list)
#'
#' ## now, get (first three) detailpages:
#' urls <- dat_listpages$detailpage_url
#' detailpagehtml_list <- fetch_all_detailpage_html(urls, max_items = 3,
#'                                                  delay_detailpage = 1)
#' detailpagehtml_list
#' }
#'
#' @export
fetch_all_detailpage_html <- function(detailpageurls, max_items = Inf,
                                      delay_detailpage = NA) {
  ## check if there are more urls than max_items:
  if (length(detailpageurls) > max_items) {
    detailpageurls <- detailpageurls[1:max_items]
  }
  ## get html for all urls:
  ret <- list(
    url = detailpageurls,
    html = purrr::map(detailpageurls, function(i) {
      message("Fetching detailpage ", i, "...")
      ret <- try(xml2::read_html(i), silent = TRUE)
      if (!is.na(delay_detailpage)) {
        message("Sleeping for ", delay_detailpage, " second(s)...")
        Sys.sleep(delay_detailpage)
      }
      ## if reading the page fails, return NA:
      if (inherits(ret, "try-error")) {
        warning("Something unexpected happened when fetching the detailpage. \n",
                "(", ret[1], ")\n",
                "Returning NA instead of web page html.")
        ret <- NA
      }
      return(ret)
    })
  )
  message("Done.")
  return(ret)
}
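## Sketch of a small (hypothetical) helper, assuming the list structure
## returned above: since failed fetches are stored as NA in the html list,
## they can be dropped before parsing by filtering url and html in parallel.
drop_failed_detailpages <- function(detailpagehtml_list) {
  ok <- !purrr::map_lgl(detailpagehtml_list$html,
                        ~ length(.x) == 1 && is.na(.x))
  list(url = detailpagehtml_list$url[ok],
       html = detailpagehtml_list$html[ok])
}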
#' Parse data from multiple product detail pages
#'
#' Returns all categories and their values in a list of
#' detailed product description pages, as well as
#' a summary of all price values from the price list
#' in each of the detailed product description pages.
#' In contrast to the \code{parse_single_detailpage}
#' function, the categories describing a product are
#' the columns, and each product is represented as a
#' row in the resulting tibble (data.frame).
#' The tibble has as many columns as there are distinct
#' categories; if a product does not feature a category in its
#' description, the corresponding column will be \code{NA}. Column
#' types are inferred from the data automatically.
#' If \code{returntype} is specified to be \code{"list"},
#' the data is returned as a list, without combining
#' the data into a data frame.
#'
#' @param detailpagehtml_list A list as returned by
#' \code{fetch_all_detailpage_html}, containing the urls and the
#' html structures of multiple geizhals detailpages, each listing
#' the details of a specific item.
#' @param returntype Either \code{"list"} or
#' \code{"data.frame"} (default).
#'
#' @return A tibble (data.frame) with as many columns
#' as there are distinct categories in all feature
#' pages, and as many rows as there are products
#' for \code{returntype = "data.frame"}. Otherwise,
#' a list where each list entry contains the
#' parsed data from a single detailpage (not
#' necessarily with each list entry having the
#' same categories).
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## pick only the three first detailpage urls:
#' wch_detailpage_urls <- dat_listpage[["detailpage_url"]][1:3]
#' detailpagehtml_list <- fetch_all_detailpage_html(wch_detailpage_urls)
#' ## get data from all detailpages:
#' dat_detailpages <- parse_all_detailpages(detailpagehtml_list)
#' head(dat_detailpages)
#' ## get the same data as a list:
#' dat_detailpages_list <- parse_all_detailpages(detailpagehtml_list,
#'                                               returntype = "list")
#' head(dat_detailpages_list)
#' }
#'
#' @export
parse_all_detailpages <- function(detailpagehtml_list,
                                  returntype = "data.frame") {
  ## get detailpage tibble:
  singlepage_list <- purrr::map(
    detailpagehtml_list$html,
    parse_single_detailpage)
  ## add url to tibble (to serve as join key later):
  singlepage_list_with_url <- purrr::map2(
    singlepage_list, detailpagehtml_list$url,
    ~ dplyr::bind_rows(
      tibble::tibble(key = "url", value = .y),
      .x
    ))
  ## if return type should be list, then just return this list:
  if (returntype == "list")
    return(singlepage_list_with_url)
  ## otherwise, build data.frame:
  detaildat_long <- combine_detailpages(singlepage_list_with_url)
  return(detaildat_long)
}
#' Combine list of detailpages into data.frame
#'
#' Takes a list of parsed detailpages and combines them
#' into a data.frame. The categories describing
#' a product are
#' the columns, and each product is represented as a
#' row in the resulting tibble (data.frame).
#' The tibble has as many columns as there are distinct
#' categories; if a product does not feature a category in its
#' description, the corresponding column will be \code{NA}. Column
#' types are inferred from the data automatically.
#'
#' @param singlepage_list_with_url A list of parsed
#' detailpages as returned by the function
#' \code{parse_all_detailpages} when used with
#' \code{returntype = "list"}.
#'
#' @return A tibble (data.frame) with as many columns
#' as there are distinct categories in all feature
#' pages, and as many rows as there are products.
#'
#' @examples
#' \dontrun{
#' ## get data from multiple geizhals category pages:
#' url_geizhals <- "https://geizhals.at/?cat=acam35"
#' listpagehtml_list <- fetch_all_listpages(url_geizhals, max_pages = 2)
#' dat_listpage <- parse_all_listpages(listpagehtml_list)
#' ## pick only the three first detailpage urls:
#' wch_detailpage_urls <- dat_listpage[["detailpage_url"]][1:3]
#' detailpagehtml_list <- fetch_all_detailpage_html(wch_detailpage_urls)
#' ## get the same data as a list:
#' dat_detailpages_list <- parse_all_detailpages(detailpagehtml_list,
#'                                               returntype = "list")
#' ## combine to data.frame:
#' combine_detailpages(dat_detailpages_list)
#' }
#'
#' @export
combine_detailpages <- function(singlepage_list_with_url) {
  ## get a list of all keys in all of the detailpage tibbles:
  all_keys <- purrr::map(singlepage_list_with_url, ~ .x[["key"]]) %>%
    unlist() %>% unique()
  ## build tibble by joining each detailpage tibble to the tibble of keys,
  ## and reshaping from long (key/value) to wide (one column per key):
  detaildat_wide <- tibble::tibble(key = all_keys)
  detaildat_long <- NULL
  for (i in seq_along(singlepage_list_with_url)) {
    detaildat_wide_tmp <- dplyr::left_join(
      detaildat_wide, singlepage_list_with_url[[i]], by = "key")
    detaildat_long_tmp <- tidyr::spread(
      detaildat_wide_tmp, key = "key", value = "value")
    detaildat_long <- dplyr::bind_rows(detaildat_long, detaildat_long_tmp)
  }
  ## since all columns are of type character, guess the
  ## correct types:
  detaildat_long <- dplyr::mutate_all(detaildat_long,
                                      readr::parse_guess)
  return(detaildat_long)
}
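## Offline sketch of the reshaping above, using two tiny hand-made key/value
## tibbles (hypothetical data; real detailpages have many more categories):
demo_pages <- list(
  tibble::tibble(key = c("url", "Hersteller", "price_min"),
                 value = c("https://example.org/a", "ACME", "99.90")),
  tibble::tibble(key = c("url", "Gewicht", "price_min"),
                 value = c("https://example.org/b", "1.2kg", "123.45"))
)
combine_detailpages(demo_pages)
## expected: one row per product; categories missing on a page
## (e.g. "Gewicht" for the first one) become NA.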