#' China Times
#'
#' @param dateWant \code{"yyyymmdd"}.
#'
#' @return
#' data.table object with columns:
#'
#' @examples
#' getAllNewsUdn("20161001")
#'
#' @export
getAllNewsChinaTimes <- function(dateWant, sleepBetween = 5, sleepTime = 1, printSts=TRUE){
PageUrl<- getPageUrlChinaTimes(dateWant)
UrlList <- lapply(PageUrl, getUrlListChinaTimes)
UrlList <- do.call(rbind,UrlList)
# newsList <- lapply(UrlList$URL, getNewsChinaTimes)
newsList <- list()
for( i in 1:length(UrlList$URL)){
newsList[[i]] <- getNewsChinaTimes(UrlList$URL[i])
if ( (i %% sleepBetween) ==0) Sys.sleep(sleepTime)
if ( printSts&(i%%100 ==0) ) cat('中國時報 ', i, 'is ok', '\n')
}
do.call(rbind, newsList)
}
getPageUrlChinaTimes <- function(dateWant){
wantURL <- sprintf('http://www.chinatimes.com/history-by-date/%s-%s-%s-2601?page=%s', substr(dateWant,1,4), substr(dateWant,5,6), substr(dateWant,7,8), 1)
res <- GET(wantURL, encoding='utf8')
res2 <- content(res, encoding='utf8')
(maxPage<- xpathSApply(res2, '//div[@class="pagination clear-fix"]/ul/li/a', xmlAttrs))
maxPage <- str_replace(str_extract(maxPage[length(maxPage)],'\\?page=[0-9]+$'),'\\?page=', '')
wantPages<- sapply(1:maxPage,
function(wantPage) sprintf('http://www.chinatimes.com/history-by-date/%s-%s-%s-2601?page=%s', substr(dateWant,1,4), substr(dateWant,5,6), substr(dateWant,7,8), wantPage)
)
return(wantPages)
}
getUrlListChinaTimes <- function(URL, sleepTime = 0){
Sys.sleep(sleepTime)
res <- GET(URL)
res2 <- content(res, encoding='utf8')
UrlList <- xpathSApply(res2, '//div[@class="listRight"]/ul/li/h2/a', xmlAttrs)
UrlList <- UrlList[rownames(UrlList)=='href',]
UrlListCate <- xpathSApply(res2, '//div[@class="listRight"]/ul/li/div[@class="kindOf"]/a', xmlValue)
UrlListCate <- str_replace_all(UrlListCate, '[:space:]', '')
getNewUrL <- function(URL){
wantURL <- URLencode(sprintf('http://www.chinatimes.com%s',URL))
}
UrlList2 <- unlist(lapply(UrlList, getNewUrL))
return(data.frame(URL=UrlList2,cate=UrlListCate, stringsAsFactors = FALSE))
}
getNewsChinaTimes <- function(URL, sleepTime = 0){
Sys.sleep(sleepTime)
res <- content(GET(URL), encoding='utf8')
newsCate <- xpathSApply(res, '//article[@class="clear-fix"]/ul/li/h6', xmlValue)
newsCate <- newsCate[length(newsCate)]
newsCate <- str_replace_all(newsCate, '[:space:]', '')
newsTitle <- xpathSApply(res, '//article[@class="clear-fix"]/header/h1', xmlValue)
newsTitle <- str_replace_all(newsTitle, '[:space:]', '')
newsText <- xpathSApply(res, '//article[@class="clear-fix"]/article[@class="clear-fix"]/p', xmlValue)
newsText <- paste(newsText, collapse = '\n')
newsAuthor <- xpathSApply(res, '//div[@class="reporter"]/div', xmlValue)
newsAuthor <- str_replace_all(newsAuthor, '整理|記者|/.+|/.+|.+基金經理人|.+部門|主管|\xa2\xac.+', '')
newsAuthor <- paste(newsAuthor, collapse = '&')
newsDate <- xpathSApply(res, '//div[@class="reporter"]/time', xmlValue)
newsDate <- str_replace_all(newsDate, '[:space:]', '')
newsTime <- str_replace_all(str_extract(newsDate, '[0-9]{2}:[0-9]{2}'), ':', '')
newsDate <- str_replace_all(str_extract(newsDate, '[0-9]{4}年[0-9]{2}月[0-9]{2}日'), '年|月|日', '')
newsClick <- xpathSApply(res, '//div[@class="article_star clear-fix"]/div[@class="art_click clear-fix"]/span[@class="num"]', xmlValue)[1]
newsClick <- ifelse(is.character(newsClick),newsClick, NA)
newsClick <- ifelse(is.list(newsClick),0,newsClick)
# URLQry <- str_replace_all(str_replace_all(URL, '/', '%2F'), ':' ,'%3A')
# URLnewsShare1 <- paste0('http://graph.facebook.com/fql?q=SELECT%20share_count,%20like_count,%20comment_count,%20total_count,%20comments_fbid,%20click_count%20FROM%20link_stat%20WHERE%20url=%22',URLQry,'%22','&callback=_ate.cbs.rcb_httpwwwchinatimescom0')
# newsShare1Content <- str_extract(rawToChar(GET(URLnewsShare1)$content), '\\{.+\\}')
# newsShare1Content <- rjson::fromJSON(newsShare1Content)
# newsShare1 <- newsShare1Content$data[[1]]$total_count
# newsShare1 <- ifelse(is.null(newsShare1),0,newsShare1)
#
# URLnewsShare2 <- paste(c('https://cdn.api.twitter.com/1/urls/count.json?url=',URLQry,'&callback=_ate.cbs.rcb_httpwwwchinatimescom0'), collapse = '')
# newsShare2Content <- str_extract(rawToChar(GET(URLnewsShare2)$content), '\\{.+\\}')
# newsShare2Content <- rjson::fromJSON(newsShare2Content)
# newsShare2 <- newsShare2Content$count
# newsShare2 <- ifelse(is.null(newsShare2),0,newsShare2)
#
# URLnewsShare3 <- paste0('http://api-public.addthis.com/url/shares.json?url=',URLQry, '&callback=_ate.cbs.rcb_httpwwwchinatimescom0')
# newsShare3Content <- rawToChar(GET(URLnewsShare3)$content)
# newsShare3Content <- str_extract(newsShare3Content, '\\{.+\\}')
# newsShare3Content <- rjson::fromJSON(newsShare3Content)
# newsShare3 <- newsShare3Content$shares
# newsShare3 <- ifelse(is.null(newsShare3),0,newsShare3)
result <- try(data.frame(newsWebsite='中國時報', newsCate=newsCate, newsAuthor=newsAuthor,
newsDate=newsDate, newsTime=newsTime,
newsTitle=newsTitle, newsText=newsText,
newsClick=newsClick, newsShare=NA,
href=URL, createTime=format(Sys.time(), '%Y%m%d%H%M%S'),
stringsAsFactors=FALSE ), silent = TRUE)
if( inherits(result, 'try-error')){
result <- data.frame(newsWebsite='中國時報', newsCate=NA, newsAuthor=NA, newsDate=NA, newsTime=NA,
newsTitle=NA, newsText=NA,
newsClick=NA, newsShare=NA,
href=URL, createTime=format(Sys.time(), '%Y%m%d%H%M%S'),
stringsAsFactors=FALSE )
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.