# Scrape the repost list shown under one Weibo status page.
#
# Args:
#   cH:          RCurl handle used for every request (defaults to the global
#                `ch0`; presumably an already logged-in handle -- confirm
#                against the caller).
#   oneshot_url: URL of the status whose reposts are listed.
#
# Returns:
#   NULL when the page states that nobody has reposted yet; otherwise a
#   de-duplicated data frame with one row per repost and the character
#   columns rootmid, rootname, rootuid, rooturl, repost_repost, reposturl,
#   repostmid, repostname, repostuid. `repost_repost` is the digit run found
#   in the repost's own "forward" link (NA when it contains no digits). A
#   field that is absent from the link's action-data is NA.
#
# Requires RCurl (getURL), RJSONIO (fromJSON) and XML to be attached.
f_weibo_repost_oneshot <- function(cH=ch0,
                                   oneshot_url='http://weibo.com/2043157342/zcdFaeuD1'){
  # The repost list is shipped as a JSON argument of an inline <script>;
  # this pattern captures that JSON object.
  weibodetail <- '^.*<script>STK && STK.pageletM && STK.pageletM.view\\((\\{\"pid\":\"pl_content_weiboDetail\".+?\\})\\)</script>.*$'

  # Download one repost page and return the HTML fragment embedded in it.
  fetch_repost_html <- function(page_no){
    page_url <- paste(oneshot_url, '?type=repost&page=', page_no, sep='')
    page_raw <- getURL(page_url, curl=cH, .encoding='UTF-8')
    # Without this guard gsub() would hand the whole page to fromJSON().
    if(!isTRUE(grepl(weibodetail, page_raw))){
      stop('weiboDetail pagelet not found in ', page_url, call.=FALSE)
    }
    page_html <- fromJSON(gsub(weibodetail, '\\1', page_raw))[['html']]
    if(!is.character(page_html) || length(page_html) != 1){
      stop('no "html" field in the weiboDetail pagelet of ', page_url,
           call.=FALSE)
    }
    page_html
  }

  # Text of a node as exactly one string, so vapply() never sees length 0.
  node_text <- function(node){
    txt <- xmlValue(node)
    if(length(txt) == 1) txt else NA_character_
  }

  # First check whether there are any reposts at all.
  first_html <- fetch_repost_html(1)
  if(grepl('还没有人转发', first_html)){
    return(NULL)
  }

  # Pagination: the largest numeric pager label is the page count.
  # getNodeSet() yields an empty node set (not NULL) when there is no pager,
  # so test the labels rather than is.null().
  first_doc <- htmlParse(first_html, asText=TRUE, encoding='UTF-8')
  pager_links <- getNodeSet(first_doc, path='//div[@class="W_pages W_pages_comment"]//a|//div[@class="W_pages_minibtn"]//a')
  pager_labels <- vapply(pager_links, node_text, character(1))
  pager_numbers <- as.numeric(pager_labels[grepl('^\\s*[0-9]+\\s*$', pager_labels)])
  page_cnt <- if(length(pager_numbers) > 0) max(pager_numbers) else 1

  # Collect every page; page 1 is already in hand, so it is not re-fetched.
  raw_data <- character(page_cnt)
  raw_data[1] <- first_html
  cat(1, '\n')
  for(pg_i in seq_len(page_cnt)[-1]){
    # Short random pause between consecutive requests.
    Sys.sleep(runif(n=1,min=0.001,max=0.01))
    raw_data[pg_i] <- fetch_repost_html(pg_i)
    cat(pg_i, '\n')
  }

  # Parse all pages as a single document.
  all_doc <- htmlParse(paste(raw_data, collapse='\n'), asText=TRUE,
                       encoding='UTF-8')
  # Disabled in the original: timestamp of the root status.
  # a111 <- getNodeSet(all_doc, path='//a[@class="S_link2 WB_time"]')
  # root_time <- as.POSIXlt(sapply(a111, xmlValue)[1])

  # "Forward" links of each listed repost, for both page layouts.
  forward_xpath <- paste0(
    '//div[@node-type="feed_list"]//dl[@class="comment_list S_line1 clearfix WB_feed_type "]//dd//div//a[@action-type="feed_list_forward"]',
    '|\n',
    '//div[@node-type="feed_list"]//dl[@class="comment_list W_linecolor clearfix"]//dd//a[@action-type="feed_list_forward"]')
  forward_links <- getNodeSet(all_doc, path=forward_xpath)

  # Digits in the link text -> number of reposts of that repost.
  repost_repost <- iconv(vapply(forward_links, node_text, character(1)),
                         'UTF-8', 'UTF-8')
  repost_repost <- as.numeric(gsub('[^0-9]','',repost_repost))

  # action-data is a query string (key=value pairs joined by '&').
  repost_info <- vapply(forward_links,
                        function(node) xmlGetAttr(node, 'action-data', NA_character_),
                        character(1))
  repost_info <- iconv(repost_info, 'UTF-8', 'UTF-8')

  # Value of `key` in each action-data string; NA when the key is absent.
  # The key must start the string or follow '&', so 'mid' never matches
  # inside 'rootmid', and a key in last position is still found.
  query_value <- function(key){
    key_pattern <- paste0('^(.*&)?', key, '=([^&]*)(&.*)?$')
    has_key <- grepl(key_pattern, repost_info)
    value <- rep(NA_character_, length(repost_info))
    value[has_key] <- sub(key_pattern, '\\2', repost_info[has_key])
    value
  }
  rootmid <- query_value('rootmid')
  rootname <- query_value('rootname')
  rootuid <- query_value('rootuid')
  rooturl <- query_value('rooturl')
  reposturl <- query_value('url')
  repostmid <- query_value('mid')
  repostname <- query_value('name')
  repostuid <- query_value('uid')

  # cbind() keeps every column character, as callers have always received.
  output <- unique(as.data.frame(cbind(rootmid,rootname,rootuid,rooturl,repost_repost,
                                       reposturl,repostmid,repostname,repostuid),
                                 stringsAsFactors=FALSE))
  return(output)
}
# Collect two levels of the repost tree of a Weibo status: the reposts of
# the root status, plus the reposts of every one of those reposts that shows
# a repost count of its own.
#
# Args:
#   cH:       RCurl handle passed through to f_weibo_repost_oneshot()
#             (defaults to the global `ch0`).
#   hisID:    label used only in the name of the saved .RData file.
#   root_url: URL of the root status.
#
# Returns:
#   NULL (after printing a notice) when the root status has no reposts.
#   Otherwise a list with
#     result_df: first-level rows that were not found again at the second
#                level, followed by all second-level rows, whose root*
#                columns are overwritten with the first-level repost they
#                were fetched from;
#     lost_df:   one row per crawled first-level repost with `cnt_delta`
#                (rows fetched minus the count shown on the page) and `idx`
#                (row number in the first-level table); NULL if none were
#                crawled.
#
# Side effect: saves the list as `weibo_repost` in
# 'weibo_saved_repost_<hisID>.RData' in the working directory.
f_weibo_repost_path <- function(cH=ch0, hisID='xiaonan',
                                root_url='http://weibo.com/2043157342/zcdFaeuD1'){
  # library() fails immediately when a package is missing; require() would
  # only return FALSE and let the crawl die later on an unknown function.
  library(RJSONIO)
  library(RCurl)
  library(XML)

  root_df <- f_weibo_repost_oneshot(cH=cH, oneshot_url=root_url)
  if(is.null(root_df)){
    # Print before returning; after return() the message was unreachable.
    cat('there are no repost for this weibo!!!', '\n')
    return(NULL)
  }

  # Rows that report reposts of their own and have a usable URL. Iterating
  # over row numbers (instead of matching URLs back) keeps duplicated URLs
  # tied to their own row.
  crawl_idx <- which(!is.na(root_df$repost_repost) & !is.na(root_df$reposturl))
  child_dfs <- vector('list', length(crawl_idx))
  lost_rows <- vector('list', length(crawl_idx))

  for(k in seq_along(crawl_idx)){
    idx <- crawl_idx[k]
    new_df <- f_weibo_repost_oneshot(cH=cH, oneshot_url=root_df$reposturl[idx])
    fetched_cnt <- if(is.null(new_df)) 0 else nrow(new_df)
    if(fetched_cnt > 0){
      # Within this level the parent is the first-level repost, not the
      # original status, so re-point the root* columns at it.
      new_df$rootmid <- root_df$repostmid[idx]
      new_df$rootname <- root_df$repostname[idx]
      new_df$rootuid <- root_df$repostuid[idx]
      new_df$rooturl <- root_df$reposturl[idx]
      child_dfs[[k]] <- new_df
    }
    # Difference between what was fetched and what the page advertised.
    cnt_delta <- fetched_cnt - as.numeric(root_df$repost_repost[idx])
    cat(cnt_delta, idx, ' have to be positive, perfect to be zero. \n')
    lost_rows[[k]] <- data.frame(cnt_delta, idx)
    # Progress ("进度") as the fraction of first-level rows passed.
    cat('进度', idx/nrow(root_df), '\n')
  }

  # Bind once after the loop; both results are NULL when nothing was found.
  repost_df <- do.call(rbind, Filter(Negate(is.null), child_dfs))
  lost_df <- do.call(rbind, lost_rows)

  # Drop first-level rows that reappear in the second-level results.
  root_df2 <- root_df[!root_df$reposturl %in% repost_df$reposturl, ]
  result_df <- rbind(root_df2, repost_df)
  weibo_repost <- list(result_df=result_df, lost_df=lost_df)
  save(weibo_repost, file=paste('weibo_saved_repost_', hisID,'.RData', sep=''))
  return(weibo_repost)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.