#' Download the URL List of the zsl Data
#'
#' Check the website, scrape the URLs and other attributes, then form the
#' result data.frame containing the information for further use.
#'
#' This function mostly uses the \code{xml2} package to get information from
#' the website. The final data.frame contains the announcement date, exchange,
#' file name and URL of the file. There is only one parameter, \code{url},
#' which is the page to be checked to retrieve the corresponding info. The zsl
#' list spans several pages (currently nearly 150 pages); the default
#' \code{url} is the first page, where the newest files' URLs can be
#' retrieved.
#'
#' The file name contained in the result data.frame is strictly organized,
#' namely "date + exchange + .xls". This is the basic rule for further use.
#'
#' @param url the url to be parsed to extract the information
#'
#' @return A data.frame containing the announcement date, exchange,
#' file name and URL of the file.
#' @export
zslDownList <- function(url = 'http://www.chinaclear.cn/zdjs/xbzzsl/center_flist.shtml') {
  # Scrape the CSDC ("ChinaClear") haircut-ratio (zsl) file-list page with
  # xml2 and return a data.frame of downloadable files. The default `url`
  # is the first page of the list, which carries the newest files.
  xl <- xml2::read_html(url)
  # XPath for the link nodes (<a>) whose title contains
  # "\u6298\u7b97\u7387\u7684\u901a\u77e5" ("notice of haircut ratios"),
  # and for the sibling date <span class='time'> nodes.
  xpath <- "//div[@class='pageTabContent']/ul/li/a[contains(@title, '\u6298\u7b97\u7387\u7684\u901a\u77e5')]"
  xpathd <- "//div[@class='pageTabContent']/ul/li/span[@class='time']"
  li <- xml2::xml_find_all(xl, xpath)
  # Keep only links pointing at .xls files or .shtml detail pages.
  raw_href <- xml2::xml_attr(li, 'href')
  keep <- stringr::str_detect(raw_href, 'xls$') |
    stringr::str_detect(raw_href, 'shtml$')
  li <- li[keep]
  title <- xml2::xml_text(li, trim = TRUE)
  # Announcement date: parse the 8-digit date embedded in the title; fall
  # back to the text of the matching time <span> when the title has no date.
  annce <- lubridate::ymd(stringr::str_extract(title, '[[:digit:]]{8}'))
  if (anyNA(annce)) {
    index <- which(is.na(annce))
    annce[index] <- xml2::xml_text(xml2::xml_find_all(xl, xpathd)[index])
  }
  # File links: SSE entries come without the site prefix, so prepend it;
  # relative '../..' fragments in shtml links are stripped after prefixing.
  # vapply (not sapply) guarantees a character vector even for empty input.
  href <- vapply(xml2::xml_attr(li, 'href'), function(x) {
    if (stringr::str_detect(x, '^http')) {
      return(x)
    }
    hrf <- paste0('http://www.chinaclear.cn', x)
    if (stringr::str_detect(hrf, 'shtml$')) {
      hrf <- stringr::str_replace(hrf, stringr::fixed('../..'), '')
    }
    hrf
  }, character(1), USE.NAMES = FALSE)
  # FIX: percent-encode EVERY space in each URL, not just the first one
  # (the original used str_replace, which replaces a single occurrence).
  href <- stringr::str_replace_all(href, stringr::fixed(' '), '%20')
  # Infer the exchange from the title. FIX: an unmatched title now yields
  # NA_character_ instead of the literal string "NULL" (the original `if`
  # chain fell through to invisible NULL, which as.character turned into
  # "NULL").
  exchange <- vapply(title, function(x) {
    if (stringr::str_detect(x, '\u6df1\u5733')) return('SZ')  # "Shenzhen"
    if (stringr::str_detect(x, '\u6df1\u8bc1')) return('SZ')  # "SZSE"
    if (stringr::str_detect(x, '\u4e0a\u6d77')) return('SH')  # "Shanghai"
    NA_character_
  }, character(1), USE.NAMES = FALSE)
  # Canonical file name: "<yyyymmdd><exchange>.<suffix>" (e.g. .xls/.shtml).
  suffix <- stringr::str_extract(href, '[^.]{3,5}$')
  fn <- paste0(format(annce, format = '%Y%m%d'), exchange, '.', suffix)
  # Assemble and return the full download list.
  data.frame(annce, exchange, fn, href, stringsAsFactors = FALSE)
}
# NOTE(review): the following two lines were web-scrape boilerplate (not R
# code) and broke parsing; commented out so the file is valid R.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.