#!/usr/bin/env Rscript
# Setup ------------------------------------------------------------------------
# The first command-line argument selects what the script does; the rest of the
# file compares `arg` against "collect", "dwd" and 3.
args <- commandArgs(trailingOnly = TRUE)
arg <- args[1]
# Fail fast with a usage message: when no argument is given `args[1]` is NA and
# the first `if (arg == ...)` would otherwise abort with the opaque
# "missing value where TRUE/FALSE needed" error.
if (is.na(arg)) {
  stop("Usage: Rscript <script> collect|dwd|3", call. = FALSE)
}

# Only the pipe operator is borrowed from dplyr; everything else in the script
# is expected to be called with an explicit namespace.
`%>%` <- dplyr::`%>%`

# All relative paths used later (url.RData, one directory per book) resolve
# against this directory.
base_dir <- "~/databases/lelivros"
first_run <- !dir.exists(base_dir)
if (first_run) {
  # recursive = TRUE also creates ~/databases when it does not exist yet.
  dir.create(base_dir, recursive = TRUE)
}
setwd(base_dir)
if (first_run) {
  # This step runs only once, when the working directory is first created.
  dir.create("data")
}
# Collect ----------------------------------------------------------------------
if (arg == "collect") {
  # Walk the paginated listing until a page yields no book links, then save the
  # de-duplicated URLs together with bookkeeping columns to url.RData.
  base_url <- "http://lelivros.love/page/"
  pages <- list()
  i <- 1
  # Original author's note here was "624" -- presumably the number of listing
  # pages at the time of writing; the loop does not depend on it.
  repeat {
    # A page past the end of the listing may come back as an HTTP error instead
    # of an empty page. Treat a failed fetch as the end of the crawl so that
    # everything gathered so far is still saved.
    h <- tryCatch(
      xml2::read_html(paste0(base_url, i), options = "NOERROR"),
      error = function(e) {
        message("Stopping at page ", i, ": ", conditionMessage(e))
        NULL
      }
    )
    if (is.null(h)) break

    hrefs <- h %>%
      rvest::html_nodes(xpath = "//div//li/a") %>%
      rvest::html_attr("href")
    # fixed = TRUE keeps the "." in the host name literal; grepl() returns FALSE
    # for NA hrefs, so anchors without an href are dropped here as well.
    url.nova <- unique(hrefs[grepl("lelivros.love/book", hrefs, fixed = TRUE)])
    if (length(url.nova) == 0) break

    # Collect per-page results in a list instead of growing a vector with c().
    pages[[i]] <- url.nova
    i <- i + 1
  }
  # as.character() keeps the column present (zero rows) when nothing was found.
  url.todas <- unique(as.character(unlist(pages)))
  url <- dplyr::tibble(
    url = url.todas,
    dt.url = Sys.Date(),
    collect = 0,
    dt.collect = NA
  )
  saveRDS(url, "url.RData")

# Download ---------------------------------------------------------------------
} else if (arg == "dwd") {
  url <- readRDS("url.RData")

  # Record the download status of one book (1 = directory already present,
  # 2 = a download failed, -1 = unreadable PDF).
  # NOTE(review): `m` is never created in this file -- it looks like a MongoDB
  # collection handle (it is used as m$update(query, update)); confirm where it
  # is supposed to come from. Until then a missing `m` only produces a warning
  # instead of aborting the whole download run.
  set_collected <- function(book_url, status) {
    if (!exists("m")) {
      warning("Status ", status, " not recorded for ", book_url,
              ": object 'm' is not defined", call. = FALSE)
      return(invisible(NULL))
    }
    m$update(
      paste0('{"url":"', book_url, '"}'),
      paste0('{"$set": {"collected": ', status, '}}')
    )
  }

  # Escaped dot and end anchor: only hrefs that really end in one of the three
  # book extensions are kept, and the same pattern yields the extension.
  ext_re <- "\\.(epub|pdf|mobi)$"

  # seq_len() so that an empty URL table runs zero iterations.
  for (w in seq_len(nrow(url))) {
    # Row w of the url column (indexing the tibble as url[w] would select
    # column w instead).
    book_url <- url$url[w]

    # Parse the book page once and reuse it for both the links and the title.
    page <- tryCatch(
      xml2::read_html(book_url),
      error = function(e) {
        warning("Could not read ", book_url, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
    if (is.null(page)) next

    url.dwd <- page %>%
      rvest::html_nodes(xpath = "//div/div[@class='links-download']/a") %>%
      rvest::html_attr("href")
    # TODO (original author): improve this filter. The original note also said
    # "pagar 7 dias" -- presumably about links that are not free yet; confirm.
    url.dwd <- url.dwd[grepl(ext_re, url.dwd)]

    # Directory name from the page title: the parts separated by an en dash are
    # trimmed, joined with "__", and spaces become "_".
    title <- page %>%
      rvest::html_nodes(
        xpath = "//div//h1[@class='product_title entry-title']"
      ) %>%
      rvest::html_text()
    if (length(title) == 0) {
      warning("No title found on ", book_url, "; skipping", call. = FALSE)
      next
    }
    parts <- trimws(strsplit(title[1], "–")[[1]])
    txt <- gsub(" ", "_", paste0(parts, collapse = "__"))
    if (!nzchar(txt)) {
      warning("Empty title on ", book_url, "; skipping", call. = FALSE)
      next
    }

    if (dir.exists(txt)) {
      # A directory with this name already exists: mark as collected and go on.
      set_collected(book_url, 1)
      next
    }

    # Create the book directory.
    dir.create(txt)

    # Download the book in each of the available formats.
    exts <- stringr::str_extract(url.dwd, ext_re)
    for (k in seq_along(url.dwd)) {
      r <- suppressMessages(try(
        download.file(
          url = url.dwd[k],
          destfile = file.path(txt, paste0("book", exts[k])),
          quiet = TRUE,
          mode = "wb" # binary mode; text mode corrupts these files on Windows
        ),
        silent = TRUE
      ))
      # download.file() signals failure either by an error or by a non-zero
      # return code.
      if (inherits(r, "try-error") || !identical(as.integer(r), 0L)) {
        set_collected(book_url, 2)
      }
      Sys.sleep(2) # pause between requests
    }

    pdf_path <- file.path(txt, "book.pdf")
    if (file.exists(pdf_path)) {
      # If the PDF cannot be read, mark the book as invalid and remove its
      # directory.
      x <- suppressMessages(try(pdftools::pdf_info(pdf_path), silent = TRUE))
      if (inherits(x, "try-error")) {
        set_collected(book_url, -1)
        unlink(txt, recursive = TRUE)
      }
    }
  }
}
# Clean up ---------------------------------------------------------------------
# Remove book directories that contain no files (for example pages that
# offered no downloadable format).
# `arg` is a character string, so comparing against "3" is what `arg == 3`
# already did after coercion.
if (arg == "3") {
  # Only directories are candidates. Listing every entry with list.files()
  # would also return regular files such as url.RData, and because
  # list.files("<file>") has length 0 they would be deleted as "empty".
  # The "data" directory created during setup is kept as well.
  todos.dir <- setdiff(
    list.dirs(full.names = FALSE, recursive = FALSE),
    "data"
  )
  # list.files() ignores hidden files by default, so a directory holding only
  # dot-files still counts as empty here.
  n.files <- vapply(
    todos.dir,
    function(d) length(list.files(d)),
    integer(1)
  )
  unlink(todos.dir[n.files == 0], recursive = TRUE)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.