tests/testthat/test-scrape-detailpage.R

context("scrape detailpage")

# setwd("./tests/testthat/")   # nolint
detailpagehtml_trockner_01 <- xml2::read_html(
  system.file("extdata", "gh-dp-trockn-01.html", package = "rgeizhals"))
detailpagehtml_trockner_02 <- xml2::read_html(
  system.file("extdata", "gh-dp-trockn-02.html", package = "rgeizhals"))
detailpagehtml_nas_01 <- xml2::read_html(
  system.file("extdata", "gh-dp-nas-01.html", package = "rgeizhals"))

detailpageurls_trockner <- c(
  system.file("extdata", "gh-dp-trockn-01.html", package = "rgeizhals"),
  system.file("extdata", "gh-dp-trockn-02.html", package = "rgeizhals")
)


## ========================================================================= ##
## parse_detailpage_categories
## ========================================================================= ##

#paste0("\"", r, "\"") %>% paste(collapse = ", ") %>% paste0("c(", ., ")") %>% cat()  # nolint
#paste0("\"", r[["value"]], "\"") %>% paste(collapse = ", ") %>% paste0("c(", ., ")") %>% cat()  # nolint
#detailpagehtml <- detailpagehtml_trockner_02                                         # nolint

test_that("parse_detailpage_categories()
          parses categories correctly", {
            r <- parse_detailpage_categories(detailpagehtml_trockner_01)
            expect_equal(
              r,
              c("Energieeffizienzklasse", "Kondensationseffizienzklasse",
                "Farbe", "Fassungsvermögen", "Programmanzahl", "Programme",
                "Zusatzoptionen", "Display", "Ausstattung", "Sicherheit",
                "Wartungshinweise", "Geräuschentwicklung", "Standardprogramm",
                "Energieverbrauch", "Bauart", "Abmessungen (HxBxT)", "Gewicht",
                "Gelistet seit")
            )
            r <- parse_detailpage_categories(detailpagehtml_nas_01)
            expect_equal(
              r,
              c("Festplatte", "Intern", "Extern", "Zusätzliche Anschlüsse",
                "RAID-Level", "CPU", "RAM", "Lüfter", "Leistungsaufnahme",
                "Abmessungen (BxHxT)", "Gewicht", "Besonderheiten",
                "Herstellergarantie", "Gelistet seit")
            )
          })

## ========================================================================= ##
## parse_keyval_tbl
## ========================================================================= ##

test_that("parse_keyval_tbl()
          parses values and keys correctly", {
            r <- parse_keyval_tbl(detailpagehtml_trockner_01)
            expect_equal(dim(r),
                         c(18, 2)
            )
            expect_equal(
              r[["key"]],
              parse_detailpage_categories(detailpagehtml_trockner_01)
            )
            expect_equal(
              r[["value"]][1:12],                       ## fix this
              c("A+++", "A", "weiß", "8.00kg", "12",
                "Baumwolle, Pflegeleicht, Feinwäsche, Oberhemden, Finish Wolle, Express, Jeans, Outdoor, Imprägnieren, Lüften warm, Vorbügeln", # nolint
                "Knitterschutz, Schonen plus", "ja",
                "Restlaufzeitanzeige, Startzeitvorwahl 24h, Signal am Programmende, Trommelbeleuchtung, Anschlussmöglichkeit und Schlauch für direkten Kondenswasserablauf, Reversierautomatik, Duftautomatik (FragranceDos), wartungsfreier Kondensator",  # nolint
                "Kindersicherung",
                "Anzeige für Filter reinigen, Kondensatbehälter leeren",
                "64dB(A)") #,
                #"177min", "171kWh/\vJahr",
                #"Standgerät, unterbaufähig, säulenfähig",
                #"850x596x636mm", "59.00kg", "27.03.2017, 13:03")
            )

            r <- parse_keyval_tbl(detailpagehtml_nas_01)
            expect_equal(dim(r),
                         c(14, 2)
            )
            expect_equal(
              r[["key"]],
              parse_detailpage_categories(detailpagehtml_nas_01)
            )
            expect_equal(
              r[["value"]][-c(1, 2, 4, 5)],            ## fix this
              c(# "N/A", "10x 2.5\"/3.5\", SATA 6Gb/s, Hot-Swap",
                "4x Gb LAN",
                # "3x USB-A 3.0 (Host), 2x USB-A 2.0 (Host), 2x eSATA 6Gb/s, 1x HDMI 1.4, 1x S/PDIF (optisch)",   # nolint
                # "0/1/5/6/10/JBOD/Single",
                "Intel Celeron N3160, 4x 1.60GHz",
                "4GB DDR3L SO-DIMM (max. 8GB)", "2x 120mm",
                "68.3W (Betrieb), 34.4W (Leerlauf)", "293x215.5x230mm",
                "6.20kg",
                "LCD, Kodi, iSCSI, 256bit AES-Verschlüsselung, FTP-Server, iTunes-Server, BitTorrent-Client, WebDAV, vier IP-Kamera Lizenzen", # nolint
                "drei Jahre (Abwicklung über ASUS)", "08.07.2016, 15:11")
            )
          })


## ========================================================================= ##
## parse_prices
## ========================================================================= ##

test_that("parse_prices()
          parses prices correctly", {
            r <- parse_prices(detailpagehtml_trockner_01)
            expect_equal(
              r,
              c(1248.99, 1249.00, 1249.00, 1299.00, 1408.00)
            )

            r <- parse_prices(detailpagehtml_nas_01)
            expect_equal(
              r,
              c(920.29, 920.30, 920.30, 920.30, 923.92, 925.21, 929.00,
                952.00, 979.27, 1000.90, 1019.66, 1028.10)
            )
          })

## ========================================================================= ##
## calc_price_summary
## ========================================================================= ##

test_that("calc_price_summary()
          works as expected", {
            r <- calc_price_summary(detailpagehtml_trockner_01)
            expect_equal(
              r %>%
                dplyr::filter(key == "price_min") %>%
                dplyr::pull(value),
              min(parse_prices(detailpagehtml_trockner_01))
            )
            expect_equal(
              r %>%
                dplyr::filter(key == "price_median") %>%
                dplyr::pull(value),
              median(parse_prices(detailpagehtml_trockner_01))
            )
            expect_equal(
              r %>%
                dplyr::filter(key == "price_2nd_min") %>%
                dplyr::pull(value),
              sort(parse_prices(detailpagehtml_trockner_01),
                   decreasing = FALSE)[2]
            )
            expect_equal(
              r %>%
                dplyr::filter(key == "price_3rd_min") %>%
                dplyr::pull(value),
              sort(parse_prices(detailpagehtml_trockner_01),
                   decreasing = FALSE)[3]
            )

            r <- calc_price_summary(detailpagehtml_nas_01)
            expect_equal(
              r[["value"]],
              c(920.290, 920.300, 920.300, 927.105)
            )
          })

## ========================================================================= ##
## parse_single_detailpage
## ========================================================================= ##

test_that("parse_single_detailpage()
          works as expected", {
            r <- parse_single_detailpage(detailpagehtml_trockner_01)
            expect_equal(
              r[["key"]],
              c(parse_keyval_tbl(detailpagehtml_trockner_01)[["key"]],
                calc_price_summary(detailpagehtml_trockner_01)[["key"]])
            )
            expect_equal(
              r[["value"]],
              c(parse_keyval_tbl(detailpagehtml_trockner_01)[["value"]],
                calc_price_summary(detailpagehtml_trockner_01)[["value"]])
            )

            r <- parse_single_detailpage(detailpagehtml_nas_01)
            expect_equal(
              r[["key"]],
              c(parse_keyval_tbl(detailpagehtml_nas_01)[["key"]],
                calc_price_summary(detailpagehtml_nas_01)[["key"]])
            )
            expect_equal(
              r[["value"]],
              c(parse_keyval_tbl(detailpagehtml_nas_01)[["value"]],
                calc_price_summary(detailpagehtml_nas_01)[["value"]])
            )
          })

## ========================================================================= ##
## fetch_all_detailpage_html
## ========================================================================= ##

detailpagehtml_list_trockner <- fetch_all_detailpage_html(
  detailpageurls_trockner
)

test_that("fetch_all_detailpage_html()
          has urls are correctly represented in returned list", {
          expect_equal(detailpagehtml_list_trockner$url,
                       detailpageurls_trockner)
          })

test_that("fetch_all_detailpage_html()
          returns same lengths for url vector and html list", {
            expect_equal(length(detailpagehtml_list_trockner$url),
                         length(detailpagehtml_list_trockner$html))
          })

test_that("fetch_all_detailpage_html()
          contents are same as individually parsed contents", {
            expect_equal(detailpagehtml_list_trockner$html[[1]],
                         detailpagehtml_trockner_01)
            expect_equal(detailpagehtml_list_trockner$html[[2]],
                         detailpagehtml_trockner_02)
          })

## ========================================================================= ##
## parse_all_detailpages
## ========================================================================= ##

test_that("parse_all_detailpages()
          returns the same content as parsing page directly", {
            r <- parse_all_detailpages(detailpagehtml_list_trockner)
            expect_equal(
              ## sorted version of contents as vector:
              sort(as.vector(t(r[1, ]))),
              ## should be equal to
              c(
                ## "url" of file added to content
                ## (at beginning, as starts with slash):
                system.file("extdata", "gh-dp-trockn-01.html",
                            package = "rgeizhals"),
                ## and then values, also sorted and converted
                ## to vector:
                sort(as.vector(
                  parse_single_detailpage(
                    detailpagehtml_trockner_01)[["value"]]
                ))
              )
            )
          })

test_that("parse_all_detailpages(),
          combines keys from two detailpages correctly", {
            r <- parse_all_detailpages(detailpagehtml_list_trockner)
            expect_equal(
              sort(setdiff(names(r), "url")),
              sort(union(
                parse_single_detailpage(detailpagehtml_trockner_01)[["key"]],
                parse_single_detailpage(detailpagehtml_trockner_02)[["key"]]
              ))
            )
          })
ingonader/rgeizhals documentation built on May 29, 2019, 3:05 a.m.