#' search chengyu including a word
#'
#' input a word, return the all chengyu including this word.
#' @author lgm
#' @param word
#' @param showExplanation: TRUE will return them with explanations
#' @return chengyus
#' @export
#' @examples
#'# search a word without explanations
#'search_chengyu_online("名")
#'
#'# search a word with explanations
#'search_chengyu_online("民",showExplanation = FALSE)
#'
#'# search the exact words
#'search_chengyu_online("百代文宗")
#'
#'#return only chengyu with pinyin using local data
#'search_chengyu_online("李",pinyin=TRUE)
search_chengyu_online <- function(word, showExplanation= TRUE,pinyin=FALSE){
front_url <- URLdecode("http%3A%2F%2Fcy.5156edu.com%2Fserach.php%3Ff_key%3D")
back_url <- "&f_type=chengyu&f_type2="
# using a py script to quote chinese words in gbk into url
path <- paste(system.file(package = "gmdata"),"make-url.py",sep="/")
cmd <- paste("/anaconda3/bin/python", path, word)
word_gbk <- system(command = cmd, intern = TRUE)
url <- paste0(front_url, word_gbk, back_url)
# scrape the page by searching
suppressWarnings(suppressPackageStartupMessages({
library(dplyr)
library(stringr)
library(purrr)
library(purrrlyr) #by_row()
library(rvest)
library(magrittr)
library(tidyverse)
}))
url %>%
html_session %>%
read_html(encoding="gbk") %>%
html_nodes(".font_15") %>%
html_table -> tb
stack_one_col <- function(df) {
# library(tidyverse)
# df %>%
# by_row(~as.data.frame(t(as.matrix(.x)))) %>%
# bind_rows(.) %>%
# .$.out %>%
# bind_rows() -> res
df %>%
by_row(~paste(.x[,1],.x[,2],sep="::")) %>%
.$.out %>%
unlist %>%
.[-1] -> res
return(res)
}
if (showExplanation==TRUE && pinyin==FALSE){
return(stack_one_col(tb[[1]]))
} else if (pinyin==TRUE){
# cypinyin.rda is a local data
py <- grep(word,cypinyin,value = TRUE)
return(py)
} else {
return(tb[[1]]$X1)
}
}
# grep("我",cypinyin,value = TRUE) %>% gsub("[一-龥]*","",.)
# search_chengyu_online("李")
# save chengyu explanations into local disk
# just scrape chengyu explanations
# scrape <- function(cy){
# wc=regmatches(cy,regexpr("[一-龥]*",cy))
# py=trimws(gsub("[一-龥]*","",cy),"left")
# ex=search_chengyu_online(wc) %>% gsub("[一-龥]*::","",.)
# data = paste0(wc,"|",py,"|",ex)
# cat(data,file="inst/chengudata.txt",sep="\n",append=TRUE)
# print(paste(wc,"is scraped."))
# }
#
# # parallel scraping
# #suppressPackageStartupMessages(library(doParallel))
# #registerDoParallel(cores=4)
#
# #system.time(foreach(i=1:length(cypinyin),.packages = c("tidyverse","rvest")) %dopar%
# # {scrape(cypinyin[i])})
#
# system.time(cypinyin[1:1000] %>% map(~scrape(.x)))
#
#
#
# tidy_dir()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.