# R/extract_highlight.R

# Defines functions: extract_html_text, extract_highlight, extract_print, extract_firstline

# Register names that appear in non-standard evaluation (pipe placeholders,
# bare column names) so that R CMD check does not report them as undefined
# globals. The call is namespace-qualified because `globalVariables()` lives
# in utils, which a package namespace does not import implicitly.
utils::globalVariables(c("text", ".", "X1", "X2", "get_title"))

#' Collect the distinct, non-empty text of the nodes matched on a page
#'
#' @param url character. Location of the HTML document, handed to
#'   `xml2::read_html()`.
#' @param pattern character. Selector handed to `rvest::html_nodes()`.
#' @return A character vector of whitespace-trimmed node texts with empty
#'   strings and duplicates removed.
#' @importFrom clipr read_clip write_clip
#' @importFrom xml2 read_html
#' @importFrom rvest html_nodes html_text
#' @importFrom stringr str_trim str_length
#' @noRd
extract_html_text <- function(url, pattern) {
    page <- xml2::read_html(url)
    matched_nodes <- rvest::html_nodes(page, pattern)

    # Trim surrounding whitespace first so blank-only nodes become empty
    # strings and are removed by the length filter below.
    node_text <- stringr::str_trim(rvest::html_text(matched_nodes))
    non_empty <- node_text[stringr::str_length(node_text) > 0]

    unique(non_empty)
}

#' Extract highlighted text from a WeChat article
#'
#' Long articles take a while to read in full. This function gathers the
#' emphasised fragments of a page (by default bold text and top-level
#' headings) so that the relevant parts can be skimmed quickly.
#' @param url character. Address of the article. When `NULL`, the address is
#'   read from the clipboard.
#' @param pattern character. Selector for the nodes whose text is collected.
#' @param is_paste Logical. Whether or not to paste the text on the clipboard.
#' @return The retained fragments joined by newlines into one string,
#'   returned invisibly.
#' @importFrom clipr read_clip write_clip
#' @import stringr
#' @export
#' @examples
#' extract_highlight(url = "https://mp.weixin.qq.com/s/XmnWvnMNobuF-92vbppNbQ",
#' is_paste = FALSE)
extract_highlight <- function(url = NULL, pattern = "strong, h1", is_paste = TRUE) {
    # Without an explicit address, use whatever is on the clipboard.
    if (is.null(url)) {
        url <- clipr::read_clip()
    }

    fragments <- extract_html_text(url, pattern)
    fragments <- stringr::str_remove(fragments, "^\\s+")
    # Fragments of five characters or fewer are discarded.
    fragments <- fragments[stringr::str_length(fragments) > 5]

    # Any fragment matching one of these patterns is dropped.
    unwanted <- c(
        # "synced to Top Stories"
        "\u5df2\u540c\u6b65\u5230\u770b\u4e00\u770b",
        # "scan to download the subscription assistant and post from a phone"
        "\u626b\u4e00\u626b\u4e0b\u8f7d\u8ba2\u9605\u53f7\u52a9\u624b\uff0c\u7528\u624b\u673a\u53d1\u6587\u7ae0",
        # "thanks"
        "\u611f\u8c22",
        # "Wow" (the WeChat reaction label)
        "\u5728\u770b",
        # "WeChat scan"
        "\u5fae\u4fe1\u626b\u4e00\u626b"
    )
    for (noise in unwanted) {
        fragments <- stringr::str_subset(fragments, noise, negate = TRUE)
    }

    output <- stringr::str_flatten(fragments, "\n")
    extract_print(output, is_paste = is_paste)
}

# Optionally copy `output` to the clipboard, show a short preview of it as a
# message, and hand `output` back invisibly.
#
# output:   character. Text to deliver.
# is_paste: logical. The clipboard is written only when this is exactly TRUE.
extract_print <- function(output, is_paste = TRUE) {
    # isTRUE() keeps NA, NULL, or a longer vector from raising an error in `if`.
    if (isTRUE(is_paste)) {
        clipr::write_clip(output, allow_non_interactive = TRUE)
    }
    # Preview only the first 20 characters. Base substr() is used so this
    # helper needs neither the pipe nor an attached stringr.
    message(substr(output, 1, 20))
    invisible(output)
}

#' Extract the first sentence of each paragraph in a WeChat article
#'
#' Long articles take a while to read in full. This function keeps only the
#' opening sentence of each sufficiently long paragraph so that the relevant
#' parts can be skimmed quickly.
#' @param url character. Address of the article. Only used when
#'   `content_path` is `NULL`; if both are `NULL`, the address is read from
#'   the clipboard.
#' @param content_path character. Optional path to a text file whose lines
#'   are used as the paragraphs instead of downloading `url`.
#' @param pattern character. Selector for the nodes treated as paragraphs
#'   when the content is taken from `url`.
#' @param para_length integer. Only paragraphs containing more than
#'   `para_length` Chinese full stops (U+3002) are kept.
#' @param is_paste Logical. Whether or not to paste the text on the clipboard.
#' @return The first sentences joined by newlines into one string, returned
#'   invisibly.
#' @importFrom clipr read_clip write_clip
#' @import stringr
#' @importFrom readr read_lines
#' @export
#' @examples
#' extract_firstline(url = "https://mp.weixin.qq.com/s/XmnWvnMNobuF-92vbppNbQ",
#' is_paste = FALSE)
extract_firstline <-
    function(url = NULL,
             content_path = NULL,
             pattern = "p",
             para_length = 1,
             is_paste = TRUE) {
        if (is.null(content_path)) {
            # The clipboard is consulted only when the content really comes
            # from a URL, so a file-based call never depends on a clipboard
            # being available.
            if (is.null(url)) {
                url <- clipr::read_clip()
            }
            content <- extract_html_text(url, pattern)
        } else {
            content <- readr::read_lines(content_path)
        }
        output <-
            content %>%
            # Keep paragraphs with more than `para_length` full stops.
            .[stringr::str_count(., "\u3002") > para_length] %>%
            # Take everything up to and including the first full stop.
            stringr::str_extract("^[^\u3002]+\u3002") %>%
            stringr::str_flatten("\n")
        extract_print(output, is_paste = is_paste)

    }
# JiaxiangBU/add2md documentation built on Jan. 31, 2020, 7:46 p.m.