Nothing
.lay_low <- function(){
Sys.sleep(sample(seq(1, 2, by=0.001), 1))
}
.gen_remote_driver <- function(...) {
driver <- RSelenium::rsDriver(...)
remote_driver <- driver[["client"]]
remote_driver$navigate("https://lihkg.com")
return(list(driver, remote_driver))
}
.crack_it <- function(url, remote_driver){
remote_driver$navigate(url)
Sys.sleep(sample(seq(3, 5, by=0.001), 1))
collapsed <- remote_driver$findElements("xpath", "//div[@class='_1d3Z5jQRq3WnuIm0hnMh0c']")
if (length(collapsed)) { # click collapsed comments if any
for (x in collapsed) {
x$clickElement()
}
}
html <- remote_driver$getPageSource()
if(grepl("recaptcha_widget", html[[1]])){
readline(prompt="Captcha Detected. Press [enter] to continue after solving")
return(.crack_it(url, remote_driver)) # make sure collapsed comments are expanded
}
pg <- xml2::read_html(html[[1]])
if (length(collapsed)) { # remove any additional page accidentally loaded when clicking collapsed comments
xml2::xml_remove(xml2::xml_find_all(pg, "//div[@class='_3jxQCFWg9LDtkSkIVLzQ8L']")[-1])
}
return(pg)
}
#' @importFrom magrittr %>%
.scrape_page <- function(html, postid){
##get_number
number <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("div div small ._3SqN3KZ8m8vCsD9FNcxcki") %>%
rvest::html_text()
##get_date
date <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("div div small .Ahi80YgykKo22njTSCzs_") %>%
rvest::html_attr("data-tip")
##get_uid
uid <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("div div small .ZZtOrmcIRcvdpnW09DzFk a") %>%
rvest::html_attr('href')
##get_probation
probation <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("div div small ._10ZVxePYNpBeLjzkQ88wtj") %>%
rvest::html_text() %>%
is.na() %>%
magrittr::not()
##get_text
text <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("div div .GAagiRXJU88Nul1M7Ai0H ._2cNsJna0_hV8tdMj3X6_gJ") %>%
rvest::html_text()
##get_upvote
upvote <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("._1jvTHwVJobs9nsM0JDYqKB+ ._1drI9FJC8tyquOpz5QRaqf") %>% rvest::html_text()
##get_downvote
downvote <- html %>% rvest::html_nodes("._36ZEkSvpdj_igmog0nluzh") %>%
rvest::html_node("._2_VFV1QOZok8YhOTGa_3h9+ ._1drI9FJC8tyquOpz5QRaqf") %>% rvest::html_text()
##get_collection_time
collection_time <- Sys.time()
##get_title
top.text <- html %>% rvest::html_nodes("._2k_IfadJWjcLJlSKkz_R2- span") %>% rvest::html_text()
title <- top.text[2]
board <- top.text[1]
newdf <- tibble::as_tibble(cbind(number, date, uid, probation, text, upvote, downvote))
newdf$postid <- postid # This bit might fail if date etc is NULL
newdf$title <- title
newdf$board <- board
newdf$collection_time <- collection_time
return(newdf)
}
.print_v <- function(..., verbose) {
if (verbose) {
print(...)
}
}
#' @importFrom magrittr %>%
.scrape_post <- function(postid, remote_driver, verbose) {
posts <- tibble::tibble()
for(i in 1:999){
attempt <- 1
notdone <- TRUE
nextpage <- FALSE
while( notdone && attempt <= 4 ) { # Auto restart when fails
.print_v(paste0("Attempt: ", attempt), verbose = verbose)
attempt <- attempt + 1
try({
html <- .crack_it(paste0("https://lihkg.com/thread/", postid, "/page/", i), remote_driver)
next.page <- html %>% rvest::html_node("._3omJTNzI7U7MErH1Cfr3gE+ ._3omJTNzI7U7MErH1Cfr3gE a") %>% rvest::html_text()
titlewords <- html %>% rvest::html_nodes("._2k_IfadJWjcLJlSKkz_R2- span") %>% rvest::html_text() %>% length()
if ("\u4e0b\u4e00\u9801" %in% next.page){
.print_v(paste0("page ", i, " (to be continued)"), verbose = verbose)
post <- .scrape_page(html, postid)
posts <- dplyr::bind_rows(posts, post)
nextpage <- TRUE
notdone <- FALSE
} else if (titlewords == 1){
notdone <- FALSE
posts <- tibble::tibble(number = "ERROR", date = "ERROR", uid = "ERROR", probation = "ERROR", text = "ERROR", upvote = "ERROR", downvote = "ERROR", postid = postid, title = "Deleted Post", board = "ERROR", collection_time = Sys.time())
.print_v("Empty Post, Skipping", verbose = verbose)
} else {
.print_v(paste0("page ", i, " (last page)"), verbose = verbose)
post <- .scrape_page(html, postid)
posts <- dplyr::bind_rows(posts, post)
notdone <- FALSE
}
.lay_low()
})
} # End of While Loop
if( notdone && attempt > 4 ){
if (titlewords == 2 && nrow(posts) > 1){
warning <- tibble::tibble(number = "EMPTY LAST PAGE", date = "EMPTY LAST PAGE", uid = "ERROR", probation = "ERROR", text = "ERROR", upvote = "ERROR", downvote = "ERROR", postid = postid, title = "Deleted Last Page", board = "ERROR", collection_time = Sys.time())
posts <- dplyr::bind_rows(posts, warning)
.print_v("Empty Last Page Detected", verbose = verbose)
notdone <- FALSE
} else {
stop("Error, Stopping")
}
}
if(nextpage){
next
} else if(!notdone){
break
}
} # End of For Loop
return(posts)
}
Lihkg_reader <- R6::R6Class(
"lihkg_reader",
public = list(
initialize = function(..., verbose = TRUE) {
res <- .gen_remote_driver(...)
private$remote_driver <- res[[2]]
private$driver <- res[[1]]
private$verbose <- verbose
},
save = function(file_name) {
saveRDS(self$bag, file_name)
},
scrape = function(postids) {
res <- purrr::map(postids, purrr::safely(.scrape_post), remote_driver = private$remote_driver, verbose = private$verbose)
failed_ids <- postids[!purrr::map_lgl(purrr::map(res, "error"), is.null)]
if (length(failed_ids) >= 1) {
private$failed <- append(private$failed, failed_ids)
}
self$bag <- dplyr::bind_rows(self$bag, purrr::map_df(res, "result"))
},
retry = function() {
if (length(private$failed) == 0) {
stop("No failed post id.")
}
self$scrape_alot(private$failed)
},
clear = function() {
self$bag <- tibble::tibble()
},
finalize = function() {
private$remote_driver$close()
private$driver[["server"]]$stop()
},
bag = tibble::tibble()
),
private = list(
remote_driver = NULL,
driver = NULL,
failed = c(),
verbose = NULL
)
)
#' Create a lihkg scraper
#'
#' This function creates a lihkg scraper
#'
#' @param ... parameters to be passed to RSelenium::rsDriver(). For example, you can use create_lihkg(browser = "firefox", port = sample(10000:60000, 1)) to generate a Selenium instance of Firefox.
#' @param verbose logical, whether debug information is printed out.
#' @return a lihkg scraper
#' @examples
#' \dontrun{
#' lihkg <- create_lihkg(browser = "firefox", port = sample(10000:60000, 1))
#' }
#' @export
create_lihkg <- function(..., verbose = FALSE) {
Lihkg_reader$new(..., verbose = FALSE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.