getWebPage <- function(url, encoding = "UTF-8") {
con <- curl::curl(url)
out <- con %>%
readLines() %>%
stringr::str_conv(encoding = encoding) %>%
magrittr::extract(8)
close(con)
out
}
getTagLoc <- function(text, tag_head, tag_tail = NA) {
head_df <- text %>%
stringr::str_locate_all(pattern = tag_head) %>%
.[[1]] %>% tibble::as_tibble() %>%
dplyr::mutate(tag = "head",
cnt = dplyr::row_number())
if (is.na(tag_tail)) return(head_df)
else {
tail_df <- text %>%
stringr::str_locate_all(pattern = tag_tail) %>%
.[[1]] %>% tibble::as_tibble() %>%
dplyr::mutate(tag = "tail",
cnt = dplyr::row_number())
}
dplyr::bind_rows(head_df, tail_df) %>%
dplyr::arrange(start)
}
getTagRange <- function(text, tag_head, tag_tail = NA, cnt_head = 1, cnt_tail = 1) {
tagloc <- getTagLoc(text = text, tag_head = tag_head, tag_tail = tag_tail)
txt_start <- tagloc %>%
dplyr::filter(tag == "head", cnt == cnt_head) %>%
dplyr::pull(start)
txt_end <- dplyr::if_else(any(stringr::str_detect(tagloc$tag, "tail")),
tagloc %>% dplyr::filter(tag == "tail", cnt == cnt_tail) %>% dplyr::pull(end),
stringr::str_length(text))
c(txt_start, txt_end)
}
getTableTitle <- function(x) {
tagrange <- getTagRange(text = x, tag_head = "<font", tag_tail = "</font>", cnt_head = 1, cnt_tail = 1)
out <- x %>%
stringr::str_sub(tagrange[1], tagrange[2]) %>%
stringr::str_remove_all(pattern = "<.*?>") %>%
stringr::str_trim()
out
}
getDataDate <- function(x) {
tagrange <- getTagRange(text = x, tag_head = "<center>", tag_tail = "</center>", cnt_head = 1, cnt_tail = 1)
out <- x %>%
stringr::str_sub(tagrange[1], tagrange[2]) %>%
stringr::str_remove_all(pattern = "<.*?>|最近更新日期:") %>%
stringr::str_trim()
out
}
getTableText <- function(x) {
tagrange <- x %>%
getTagLoc("<tr", "</tr>") %>%
{c(dplyr::first(.$start), dplyr::last(.$end))}
out <- x %>%
stringr::str_sub(tagrange[1], tagrange[2])
out
}
tidy01 <- function(df) {
df %>%
dplyr::slice(-1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy02 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var0 = ifelse(var2 == "", var1, NA_character_)) %>%
tidyr::fill(var0) %>%
dplyr::filter(var0 != var1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy03 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var0 = dplyr::if_else(var2 == "", var1, NA_character_)) %>%
tidyr::fill(var0) %>%
dplyr::filter(var0 != var1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character(),
var4 = dplyr::if_else(var4 == "無到期日", "9999/12/31", var4) %>%
lubridate::ymd() %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy04 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var0 = dplyr::if_else(var2 == "", var1, NA_character_)) %>%
tidyr::fill(var0) %>%
dplyr::filter(var0 != var1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy05 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var0 = dplyr::if_else(var2 == "", var1, NA_character_)) %>%
tidyr::fill(var0) %>%
dplyr::filter(var0 != var1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy06 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var0 = dplyr::if_else(var2 == "", var1, NA_character_)) %>%
tidyr::fill(var0) %>%
dplyr::filter(var0 != var1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy07 <- function(df) {
df %>%
dplyr::slice(-1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy08 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var0 = dplyr::if_else(var2 == "", var1, NA_character_)) %>%
tidyr::fill(var0) %>%
dplyr::filter(var0 != var1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy09 <- function(df) {
df %>%
dplyr::slice(-1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy10 <- function(df) {
df %>%
dplyr::slice(-1) %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
tidy11 <- function(df) {
df %>%
dplyr::slice(-1) %>%
tidyr::separate(col = var1, into = c("var1_1", "var1_2"), sep = " ") %>%
dplyr::mutate(var3 = lubridate::ymd(var3) %>% as.character()) %>%
dplyr::select(sort(names(.)))
}
########################################
# process data within a function
# ======================================
getAllSec <- function() {
d <- list()
d$pageno <- 1:11
d$url <- stringr::str_c("https://isin.twse.com.tw/isin/C_public.jsp?strMode=", d$pageno)
d$htmtxt <- purrr::map_chr(d$url, ~ getWebPage(.x, "ms950"))
d$title <- purrr::map_chr(d$htmtxt, ~ getTableTitle(.x))
d$dated <- purrr::map_chr(d$htmtxt, ~ getDataDate(.x)) %>% lubridate::ymd() %>% as.character()
d$tbltxt <- purrr::map_chr(d$htmtxt, ~ getTableText(.x))
d$rows <- purrr::map(
.x = d$tbltxt,
.f = ~ stringr::str_split(string = .x, pattern = "</tr><tr>") %>%
magrittr::extract2(1))
d$coln <- purrr::map_int(
.x = d$tbltxt,
.f = ~ .x %>%
stringr::str_split_fixed(pattern = "</tr><tr>", n = 2) %>%
magrittr::extract2(1) %>%
stringr::str_count(pattern = "</td>"))
d$colv <- purrr::map(d$coln, ~ stringr::str_c("var", 1:(.x)))
d$data <- purrr::pmap(
.l = list(d$rows, d$coln, d$colv),
.f = ~ stringr::str_split_fixed(string = ..1, pattern = "</td>", n = ..2) %>%
`colnames<-`(..3) %>%
tibble::as_tibble() %>%
dplyr::mutate_all(.funs = ~ stringr::str_remove_all(., pattern = "<.*?>")) %>%
dplyr::mutate_all(.funs = ~ stringr::str_trim(., side = "both")))
# d$colcn <- purrr::map(
# .x = d$data,
# .f = ~ dplyr::slice(.x, 1) %>% unlist())
d$colen <- list(
names_01 = c("sec_symbol", "sec_name", "isin_code", "start_date", "sector", "cfi_code", "remark"),
names_02 = c("sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "market", "sector", "cfi_code", "remark"),
names_03 = c("sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "end_date", "intr_rate", "market", "sector", "cfi_code", "remark"),
names_04 = c("sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "market", "sector", "cfi_code", "remark"),
names_05 = c("sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "market", "sector", "cfi_code", "remark"),
names_06 = c("sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "market", "sector", "cfi_code", "remark"),
names_07 = c("sec_symbol", "sec_name", "isin_code", "start_date", "cfi_code", "remark"),
names_08 = c("sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "sector", "cfi_code", "remark"),
names_09 = c("sec_symbol", "sec_name", "isin_code", "start_date", "cfi_code", "remark"),
names_10 = c("sec_name", "isin_code", "start_date", "cfi_code", "remark"),
names_11 = c("sec_symbol", "sec_name", "isin_code", "start_date", "cfi_code", "remark"))
d$df <- purrr::map2(
.x = d$data,
.y = c(tidy01, tidy02, tidy03, tidy04, tidy05, tidy06, tidy07, tidy08, tidy09, tidy10, tidy11),
.f = ~ .y(.x))
d$df <- purrr::map2(
.x = d$df,
.y = d$colen,
.f = ~ purrr::set_names(.x, .y))
d$df <- purrr::map2(
.x = d$df,
.y = d$pageno,
.f = ~ dplyr::mutate(.x, no = .y))
d$df <- purrr::map2(
.x = d$df,
.y = d$title,
.f = ~ dplyr::mutate(.x, title = .y))
d$df <- purrr::map2(
.x = d$df,
.y = d$dated,
.f = ~ dplyr::mutate(.x, dated = .y))
d$df %>%
dplyr::bind_rows() %>%
dplyr::select("no", "title", "dated", "sec_type", "sec_symbol", "sec_name", "isin_code", "start_date", "end_date", "intr_rate", "market", "sector", "cfi_code", "remark") %>%
dplyr::mutate_if(is.character, list(~stringr::str_trim(., side = "both"))) %>%
dplyr::mutate(title = stringr::str_remove(title, pattern = ",?國際證券辨識號碼一覽表"))
}
tw_getSec <- function(stockonly = FALSE) {
allsec <- getAllSec()
function() {
if (!stockonly) return(allsec)
else return(
allsec %>% dplyr::filter(
no %in% c(2, 4, 5),
market %in% c("上市", "上櫃", "興櫃"),
sec_type == "股票"
)
)
}
}()
#' Taiwan all securities list.
#'
#' @return a tibble contains following variables: no, title, dated,
#' sec_type, sec_symbol, sec_name, isin_code, start_date, end_date,
#' intr_rate, market, sector, cfi_code, remark.
#' @export
#' @examples
#' tw_getAllSec()
tw_getAllSec <- function() tw_getSec()
#' Taiwan stocks list.
#'
#' @return a tibble contains following columns:
#' market, sec_symbol, sec_name, isin_code, start_date, sector.
#' @export
#' @examples
#' tw_getStocks()
tw_getStocks <- function() {
tw_getSec() %>%
dplyr::filter(
no %in% c(2, 4, 5),
market %in% c("上市", "上櫃", "興櫃"),
sec_type == "股票") %>%
dplyr::select(market, sec_symbol, sec_name, isin_code, start_date, sector)
}
#' Taiwan TWSE stocks list.
#'
#' @return a tibble contains following columns:
#' sec_symbol, sec_name, isin_code, start_date, sector.
#' @export
#' @examples
#' tw_getStocks()
tw_twse <- function() {
tw_getStocks() %>%
dplyr::filter(market == "上市") %>%
dplyr::select(-market)
}
#' Taiwan TPEX stocks list.
#'
#' @return a tibble contains following columns:
#' sec_symbol, sec_name, isin_code, start_date, sector.
#' @export
#' @examples
#' tw_getStocks()
tw_tpex <- function() {
tw_getStocks() %>%
dplyr::filter(market == "上櫃") %>%
dplyr::select(-market)
}
#' setup a sqlite db and populate with balance sheet, income statement, monthly revenue, stock price, split and dividend of listed stocks in twse and tpex market.
#'
#' @param dbname a path to locate the sqlite db.
#' @return a list of objects, tables, and fields of securities
#' @export
#' @examples
#' tw_secdb()
tw_secdb <- function(dbname = "./../data/db/sec") {
dbexist <- fs::file_exists(path = dbname)
sec_con <- DBI::dbConnect(drv = RSQLite::SQLite(), dbname = dbname)
new <- getAllSec() %>% dplyr::mutate(update = lubridate::now() %>% as.numeric())
if (dbexist) {
old <- DBI::dbReadTable(sec_con, "seclist")
upd <- dplyr::bind_rows(new, old) %>%
arrange(isin_code, desc(update)) %>%
mutate(isin_dup = duplicated(isin_code)) %>%
filter(isin_dup == FALSE) %>%
select(-isin_dup)
} else {
upd <- new
}
DBI::dbWriteTable(sec_con, "seclist", upd, append = FALSE, temporary = FALSE)
fs::file_copy(
path = dbname,
new_path = stringr::str_c(dbname, "_",
lubridate::now() %>% as.character() %>%
str_replace(pattern = " ", replacement = "_") %>%
str_replace_all(":", "-")))
out <- list(
# DBI::dbGetInfo(sec_con), # this function is deprecated
DBI::dbListObjects(sec_con),
DBI::dbListTables(sec_con),
DBI::dbListFields(sec_con, "seclist")
)
DBI::dbDisconnect(sec_con)
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.