R/search_chengyu_online.R

Defines functions search_chengyu_online

Documented in search_chengyu_online

#' search chengyu including a word
#'
#' input a word, return the all chengyu including this word.
#' @author lgm
#' @param word
#' @param showExplanation:  TRUE will return them with explanations
#' @return chengyus
#' @export
#' @examples
#'# search a word without explanations
#'search_chengyu_online("名")
#'
#'# search a word with explanations
#'search_chengyu_online("民",showExplanation = FALSE)
#'
#'# search the exact words
#'search_chengyu_online("百代文宗")
#'
#'#return only chengyu with pinyin using local data
#'search_chengyu_online("李",pinyin=TRUE)

search_chengyu_online <- function(word, showExplanation= TRUE,pinyin=FALSE){
	front_url <- URLdecode("http%3A%2F%2Fcy.5156edu.com%2Fserach.php%3Ff_key%3D")
	back_url <- "&f_type=chengyu&f_type2="

	# using a py script to quote chinese words in gbk into url
	path <- paste(system.file(package = "gmdata"),"make-url.py",sep="/")
	cmd <- paste("/anaconda3/bin/python", path, word)
	word_gbk <- system(command = cmd, intern = TRUE)
	url <- paste0(front_url, word_gbk, back_url)

	# scrape the page by searching
	suppressWarnings(suppressPackageStartupMessages({
		library(dplyr)
		library(stringr)
		library(purrr)
		library(purrrlyr) #by_row()
		library(rvest)
		library(magrittr)
		library(tidyverse)
	}))

	url %>%
		html_session %>%
		read_html(encoding="gbk") %>%
		html_nodes(".font_15") %>%
		html_table -> tb

	stack_one_col <- function(df) {
		# library(tidyverse)
		# df %>%
		# 	by_row(~as.data.frame(t(as.matrix(.x)))) %>%
		# 	bind_rows(.) %>%
		# 	.$.out %>%
		# 	bind_rows() -> res
		df %>%
			by_row(~paste(.x[,1],.x[,2],sep="::")) %>%
			.$.out %>%
			unlist %>%
			.[-1] -> res
		return(res)
	}

	if (showExplanation==TRUE && pinyin==FALSE){
		return(stack_one_col(tb[[1]]))
	} else if (pinyin==TRUE){
		# cypinyin.rda is a local data
		py <- grep(word,cypinyin,value = TRUE)
		return(py)
	}	else {
		return(tb[[1]]$X1)
	}

}

# grep("我",cypinyin,value = TRUE) %>% gsub("[一-龥]*","",.)
# search_chengyu_online("李")

# save chengyu explanations into local disk

# just scrape chengyu explanations
# scrape <- function(cy){
#   wc=regmatches(cy,regexpr("[一-龥]*",cy))
#   py=trimws(gsub("[一-龥]*","",cy),"left")
# 	ex=search_chengyu_online(wc) %>% gsub("[一-龥]*::","",.)
# 	data = paste0(wc,"|",py,"|",ex)
# 	cat(data,file="inst/chengudata.txt",sep="\n",append=TRUE)
# 	print(paste(wc,"is scraped."))
# }
#
# # parallel scraping
# #suppressPackageStartupMessages(library(doParallel))
# #registerDoParallel(cores=4)
#
# #system.time(foreach(i=1:length(cypinyin),.packages = c("tidyverse","rvest")) %dopar%
# #  {scrape(cypinyin[i])})
#
# system.time(cypinyin[1:1000] %>% map(~scrape(.x)))
#
#
#
# tidy_dir()
Gabegit/gmdata documentation built on May 6, 2019, 5:32 p.m.