gmdata: Some useful functions for data tools.

Documented in search_word_online

# -*- coding: utf-8 -*-
#' Search a word in an online dictionary
#'
#' Looks up `word` on xh.5156edu.com and prints or returns its basic
#' explanation, its detailed explanation, or its related words.
#'
#' The search term is converted to a GBK-quoted URL fragment by the
#' `make-url.py` script located in the installed `gmdata` package directory,
#' so a Python interpreter is required. `/anaconda3/bin/python` is used when
#' it exists; otherwise the first of `python3` / `python` found on the `PATH`
#' is used.
#'
#' The flags are evaluated in this order, and the first matching rule wins:
#' \enumerate{
#'   \item `basic` only: print the basic explanation.
#'   \item `basic` and `ret` (without `more`): return the basic explanation.
#'   \item `more` without `ret`: print the detailed explanation.
#'   \item `more` and `ret`: return the detailed explanation.
#'   \item `relative`: return the related words.
#' }
#' As a consequence, `relative = TRUE` only takes effect when neither `ret`
#' nor `more` is `TRUE`.
#'
#' @author lgm
#' @param word A single, non-empty character string: the word to search.
#' @param basic Logical; select the basic explanation (default `TRUE`).
#' @param more Logical; select the detailed explanation instead.
#' @param ret Logical; return the selected explanation instead of printing
#'   it with `cat()`.
#' @param relative Logical; return the related words as a comma-separated
#'   string.
#' @return A character vector with the requested text when a returning mode
#'   is selected; otherwise `NULL`, invisibly.
#' @export
#' @examples
#' \dontrun{
#' # basic search, just cat the result
#' search_word_online("璇")
#'
#' # basic search and return the result
#' search_word_online("璇", ret = TRUE)
#'
#' # more search but no return
#' search_word_online("璇", more = TRUE)
#'
#' # more search but with return
#' search_word_online("璇", more = TRUE, ret = TRUE)
#'
#' # relative word search with return
#' search_word_online("稔", relative = TRUE)
#'
#' # search words in internal data `pyword`
#' # (these two examples use `%>%`, `str_split()` and `map()`, which the
#' # caller has to attach, e.g. via magrittr, stringr and purrr)
#' pyword[4, 2] %>% str_split(pattern = "") %>% .[[1]] %>%
#'   map(~search_word_online(.x)) %>% .[[1]]
#'
#' # more lines
#' lapply(4:6, function(x) {
#'   pyword[x, 2] %>% str_split(pattern = "") %>% .[[1]] %>%
#'     map(~search_word_online(.x)) %>% .[[1]]
#' })
#' }
search_word_online <- function(word, basic = TRUE, more = FALSE, ret = FALSE,
                               relative = FALSE) {
  # Validate first so bad input fails fast, before any package, Python or
  # network access is attempted.
  if (!is.character(word) || length(word) != 1L || is.na(word) ||
      !nzchar(word)) {
    stop("`word` must be a single non-empty character string.", call. = FALSE)
  }

  # Use namespace-qualified calls instead of library() so that calling this
  # function does not modify the caller's search path.
  for (pkg in c("rvest", "stringr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required but not installed.",
           call. = FALSE)
    }
  }

  # Basic URLs (the front part is stored percent-encoded and decoded here).
  front_url <- URLdecode("http%3A%2F%2Fxh.5156edu.com%2Findex.php%3Ff_key%3D")
  back_url <- "&f_type=zi&SearchString.x=0&SearchString.y=0"

  # Locate the Python helper that quotes the word in GBK for the URL.
  script <- system.file("make-url.py", package = "gmdata")
  if (!nzchar(script)) {
    stop("Cannot find 'make-url.py' in the installed 'gmdata' package.",
         call. = FALSE)
  }

  # Prefer the historical interpreter location; fall back to the PATH.
  python <- "/anaconda3/bin/python"
  if (!file.exists(python)) {
    found <- unname(Sys.which(c("python3", "python")))
    found <- found[nzchar(found)]
    if (length(found) == 0L) {
      stop("No Python interpreter found: '/anaconda3/bin/python' does not ",
           "exist and neither 'python3' nor 'python' is on the PATH.",
           call. = FALSE)
    }
    python <- found[[1L]]
  }

  # Quote every piece: `word` is caller-supplied and must never be
  # interpreted by the shell.
  cmd <- paste(shQuote(python), shQuote(script), shQuote(word))
  word_gbk <- system(command = cmd, intern = TRUE)
  status <- attr(word_gbk, "status")
  if (!is.null(status) && status != 0L) {
    stop("'make-url.py' failed with exit status ", status, ".", call. = FALSE)
  }
  if (length(word_gbk) == 0L || !any(nzchar(word_gbk))) {
    stop("'make-url.py' produced no output for the given word.",
         call. = FALSE)
  }
  url <- paste0(front_url, word_gbk, back_url)

  # Scrape the result page: take the ".font_18" nodes, turn <br> tags into
  # newlines, then re-parse the fragment to obtain plain text.
  # NOTE(review): html_session()/html_nodes() are kept as in the original;
  # newer rvest releases name them session()/html_elements() -- confirm
  # against the rvest version this package targets before switching.
  page <- rvest::read_html(rvest::html_session(url), encoding = "gbk")
  nodes <- rvest::html_nodes(page, ".font_18")
  fragment <- stringr::str_replace_all(nodes, "<br>", "\n")
  pg <- rvest::html_text(rvest::read_html(fragment, encoding = "gbk"))

  # Cut the three sections out of the page text using the site's headings.
  bsmean <- regmatches(pg, regexpr("基本解释：(\n)*(.*\n*)*详细解释", pg))
  bsmean <- stringr::str_replace(bsmean, "详细解释", "")
  moremean <- regmatches(pg, regexpr("详细解释：(\n)*(.*\n*)*相关词语", pg))
  rel <- regmatches(pg, regexpr("相关词语：(\n)*(.*\n*)*更多有关", pg))

  # Dispatch on the flags; the order of the rules is part of the contract.
  if (basic && !more && !ret && !relative) {
    cat(bsmean)
    return(invisible(NULL))
  }
  if (basic && ret && !more) {
    return(bsmean)
  }
  if (more && !ret) {
    cat(moremean)
    return(invisible(NULL))
  }
  if (more && ret) {
    return(moremean)
  }
  if (relative) {
    # Flatten to one line, strip the leading/trailing headings and turn the
    # four-space separators into commas.
    rel <- gsub("\n", " ", rel)
    rel <- gsub("相关词语：\r\n?  ", "", rel)
    rel <- gsub(" \n?更多有关", "", rel)
    rel <- gsub("    ", ",", rel)
    return(stringr::str_trim(rel, side = "both"))
  }

  invisible(NULL)
}