tests/lelivros.R

#!/usr/bin/env Rscript
args <- commandArgs(trailingOnly = TRUE)
arg <- args[1]
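
## Guard (added as an assumption about intended usage): the branches below
## expect exactly one argument, "collect", "dwd" or "3".
if (is.na(arg)) {
    stop('Usage: Rscript lelivros.R <collect|dwd|3>')
}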

`%>%` <- dplyr::`%>%`
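
## The calls below use dplyr, xml2, rvest, stringr and pdftools via `::`
## (plus mongolite, assumed further down for the MongoDB status updates);
## stop early if any of them is not installed.
pkgs <- c("dplyr", "xml2", "rvest", "stringr", "pdftools", "mongolite")
pkgs.missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(pkgs.missing) > 0) {
    stop("Missing packages: ", paste(pkgs.missing, collapse = ", "))
}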

if (dir.exists("~/databases/lelivros/")) {
    setwd("~/databases/lelivros/")    
} else {
    ## This step runs only once
    if (!dir.exists("~/databases")) {
        dir.create("~/databases")
    }

    dir.create("~/databases/lelivros/")
    setwd("~/databases/lelivros")

    dir.create("data")
}

# Collect ----------------------------------------------------------------------
if (arg == "collect") {

    url <- "http://lelivros.love/page/"
    url.todas <- c()
    i <- 1

    ## about 624 listing pages at the time of writing; the loop stops when a
    ## page yields no book links
    while (TRUE) {
        h <- xml2::read_html(paste0(url, i), options = "NOERROR")

        url.nova <- h %>%
            rvest::html_nodes(xpath = "//div//li/a") %>%
            rvest::html_attr("href") %>%
            .[stringr::str_detect(., "lelivros.love/book")] %>%
            .[!is.na(.)]

        url.nova <- url.nova[!duplicated(url.nova)]
        # url.nova <- url.nova[!(url.nova %in% url.coletada)]

        url.todas <- c(url.todas, url.nova)

        if (length(url.nova) == 0) break

        i <- i + 1
    }

    url.todas <- url.todas[!duplicated(url.todas)]

    url <- dplyr::tibble(url = url.todas,
                         dt.url = Sys.Date(),
                         collect = 0, 
                         dt.collect = NA
                         )

    saveRDS(url, "url.RData")
    
} else if (arg == "dwd") {
    url <- readRDS("url.RData")
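
    ## Assumed setup: `m` is a mongolite connection used for the status
    ## updates below; collection, database and connection URL are
    ## placeholders, adjust them to your environment.
    m <- mongolite::mongo(collection = "lelivros", db = "webdata",
                          url = "mongodb://localhost")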

    ## For each book page: extract the download links, build a directory name
    ## from the title, download the files and record the status in MongoDB.
    for (w in seq_len(nrow(url))) {
        url.dwd <- xml2::read_html(url$url[w]) %>%
            rvest::html_nodes(xpath = "//div/div[@class='links-download']/a") %>%
            rvest::html_attr("href")

        ## Improve here: some pages only offer a paid "7 days" link instead of
        ## direct files, so keep only .epub/.pdf/.mobi downloads.
        url.dwd <- url.dwd[grepl("\\.epub$|\\.pdf$|\\.mobi$", url.dwd)]

        txt <- xml2::read_html(url$url[w]) %>%
            rvest::html_nodes(xpath = "//div//h1[@class='product_title entry-title']") %>%
            rvest::html_text() %>%
            strsplit("–") %>%
            .[[1]] %>%
            trimws()

        txt <- gsub(" ", "_", paste0(txt, collapse = "__"))

        if (dir.exists(txt)) {
            # m$find(paste0('{"url": "', url$url[w], '"}'))

            m$update(paste0('{"url":"', url$url[w], '"}'),
                     '{"$set": {"collected": 1}}')
            next

        } else {
            # Create the book directory
            dir.create(txt)

            # Download the book in the available formats (epub, pdf, mobi)
            suppressMessages(mapply(
                function(u, ext) {
                    r <- try(download.file(
                        url = u,
                        destfile = paste0(txt, "/book", ext),
                        mode = "wb",
                        quiet = TRUE
                    ),
                    silent = TRUE)

                    if (inherits(r, "try-error")) {
                        m$update(paste0('{"url":"', url$url[w], '"}'),
                                 '{"$set": {"collected": 2}}')
                    }

                    Sys.sleep(2)
                },
                u = url.dwd,
                ext = stringr::str_extract(url.dwd, "\\.epub$|\\.pdf$|\\.mobi$")
            ))


            if (file.exists(paste0(txt, "/book.pdf"))) {
                # If the PDF cannot be read, mark the record as invalid
                x <- suppressMessages(
                    try(pdftools::pdf_info(paste0(txt, "/book.pdf")), silent = TRUE)
                )
                if (inherits(x, "try-error")) {
                    m$update(paste0('{"url":"', url$url[w], '"}'),
                             '{"$set": {"collected": -1}}')
                    unlink(txt, recursive = TRUE)
                }
            }
        }
    }
}

if (arg == "3") {
    ## Remove book directories left empty by failed downloads; regular files
    ## such as url.RData are kept.
    todos.dir <- list.files()
    todos.dir <- todos.dir[dir.exists(todos.dir)]
    index.dir <- sapply(todos.dir, function(x) length(list.files(x)))
    todos.dir <- names(index.dir[which(index.dir == 0)])
    unlink(todos.dir, recursive = TRUE)
}