# R/web.search.content.R

##' Search content by word.
##' 
##' @title Search content by word.
##' @param sword The word to be searched.
##' @param page The pages of the search result to fetch; a single number is expanded to \code{1:page}. The default value is 1 and the maximum page number is 50 (at most 25 pages are fetched per call).
##' @param combinewith A data.frame of a previous search result to combine with. The default value is NULL.
##' @param since If not NULL, restricts weibos to those posted since the given date. The date should be an object of class \code{\link{POSIXlt}} or a string formatted as "YYYY-MM-DD".
##' @param sinceID If not NULL, returns weibos with IDs greater (i.e. newer) than the specified ID.
##' @param sleepmean Mean (in seconds) of the random sleep before each page request.
##' @param sleepsd Standard deviation (in seconds) of the random sleep before each page request.
##' @param roauth An OAuth object created by \code{\link{createOAuth}}.
##' @param ... Other request parameters for this API.
##' @return 
##'  A data.frame of weibos; each weibo contains:
##'  \item{MID}{Weibo ID (string)}
##'  \item{Author}{Nick name of the author}
##'  \item{Weibo}{Weibo content}
##'  \item{Forward}{Forward content}
##'  \item{Time_Weibo}{Time of this weibo}
##'  \item{Time_Search}{Time of this search}
##'  \item{Count_Forward}{Number of forwards}
##'  \item{Count_Reply}{Number of replies}
##'  
##' @author Jian Li <\email{rweibo@@sina.com}>
##' @keywords Search
##' @examples \dontrun{
##' web.search.content("Rweibo")$Weibo
##' web.search.content("Rweibo", sinceID = "3508307023192146")$Weibo
##' web.search.content("Rweibo", since = "2012-10-01")$Weibo
##' }

web.search.content <- function(sword, page = 1, combinewith = NULL, 
		since = NULL, sinceID = NULL, sleepmean = 3, sleepsd = 1, 
		roauth = NULL, ...) 
{
	if (length(page) == 1) page <- 1:page
	page <- page[page > 0 & page <= 50]
	page <- sort(page)
	if (length(page) > 25) page <- page[1:25]
	Search <- TRUE
	ipage <- 1
	if (!is.null(combinewith)) {
		if (all(c("MID", "Author", "Weibo", "Forward", "Time_Weibo", "Time_Search", "Count_Forward", "Count_Reply") %in% names(combinewith))) {
			OUT <- combinewith[, c("MID", "Author", "Weibo", "Forward", "Time_Weibo", "Time_Search", "Count_Forward", "Count_Reply")]
			maxid <- max(as.numeric(OUT$MID))
		} else {
			OUT <- data.frame(stringsAsFactors = FALSE)
			maxid <- 0
			warning("Ignored 'combinewith' because of wrong format!")
		}
	} else {
		OUT <- data.frame(stringsAsFactors = FALSE)
		maxid <- 0
	}
	
	if (!is.null(sinceID)) {
		maxid <- max(maxid, as.numeric(sinceID))
	}
	
	if (is.null(since)) {
		maxdate <- -Inf
	} else {
		if (inherits(since, "character")) {
			since <- strptime(since, format = "%Y-%m-%d")
			if (is.na(since)) {
				warning("Ignoring 'since' because of its wrong format!")
				since <- NULL
			}
		}
		if (inherits(since, "POSIXlt")) maxdate <- since else maxdate <- -Inf
	}
	
	while (Search && ipage <= length(page)) {
		Sys.sleep(abs(rnorm(1, sleepmean, sleepsd)))
		if (is.null(roauth)) {
			tmp.search <- try(.search.content.anon(sword, page[ipage]), silent = TRUE)
		} else {
			tmp.search <- try(.search.content.curl(sword, page[ipage], roauth$webCurl), silent = TRUE)
		}
		ipage <- ipage + 1
		if (is.null(tmp.search)) {
			cat(paste(ipage - 2, " pages were stored!\n", sep = ""))
			Search <- FALSE
		} else if (inherits(tmp.search, "try-error")){
			warning(paste("Error in page ", ipage - 1, sep = ""))
		} else {
			if (min(as.numeric(tmp.search$MID)) <= maxid || min(tmp.search$Time_Weibo) < maxdate) {
				Search <- FALSE
				tmp.search <- tmp.search[as.numeric(tmp.search$MID) > maxid & tmp.search$Time_Weibo >= maxdate, ]
			}
			OUT <- rbind(tmp.search, OUT)
		}
	}
	OUT <- OUT[order(as.numeric(OUT$MID), decreasing = TRUE), ]
	return(OUT)
}
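
## Usage sketch (illustrative, not run): incremental crawling with 'combinewith'.
## The search word "Rweibo", page = 5 and the 60-second pause are arbitrary choices;
## passing a previous result back via 'combinewith' makes the function stop as soon
## as it reaches weibos that are already stored (MID <= the stored maximum).
# res <- web.search.content("Rweibo", page = 5)
# Sys.sleep(60)
# res <- web.search.content("Rweibo", page = 5, combinewith = res)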


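# Anonymous search: scrape one page of s.weibo.com search results without authentication.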
.search.content.anon <- function(sword, page = 1, ...) {
	requestURL <- "http://s.weibo.com/weibo/"
	sword <- curlEscape(.cntoUTF8(sword))
	strurl <- paste(requestURL, sword, "&xsort=time&page=", page, sep = "") # time sorting 
	pagetree <- htmlParse(strurl)
	
	pagenode <- getNodeSet(pagetree, "//script")
	
	pagescript <- sapply(pagenode, xmlValue)
	weiboline <- pagescript[grep("\"pid\":\"pl_weibo_direct\"", pagescript)]
	weibojson <- gsub("\\)$", "", gsub("^.*STK.pageletM.view\\(", "", weiboline))
	if (length(weibojson) == 0) {
		warning("Can not crawl any page now. May be forbidden by Sina temporarily.", call. = FALSE)
		return(NULL)
	}
	weibolist <- .fromJSON(weibojson)
	
	weibopage <- htmlParse(weibolist[["html"]], asText=TRUE, encoding = "UTF-8")
	
	weiboitem.attr <- getNodeSet(weibopage, "//dl[@action-type='feed_list_item']")
	weiboitem.con <- getNodeSet(weibopage, "//dd[@class='content']")
	weiboitem.nores <- getNodeSet(weibopage, "//div[@class='pl_noresult']")
	
	if (length(weiboitem.nores) == 0) {
		res.mid <- sapply(weiboitem.attr, function(X) xmlGetAttr(X, "mid"))
		res.con <- sapply(weiboitem.con, FUN = function(X) xmlValue(getNodeSet(X, "p[@node-type='feed_list_content']")[[1]]))	
		res.name <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p[@node-type='feed_list_content']/a")[[1]], "nick-name"))
		res.date <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p/a[@node-type='feed_list_item_date']")[[1]], "title"))
		res.stat <- lapply(weiboitem.con, FUN = function(X) sapply(getNodeSet(X, "p/span/a"), xmlValue))
		res.forward <- sapply(weiboitem.con, FUN = function(X) {
					tmp.node <- getNodeSet(X, "dl/dt[@node-type='feed_list_forwardContent']")
					if (length(tmp.node) == 0) {
						NA
					} else {
						xmlValue(tmp.node[[1]])
					}
				}
		)
		Encoding(res.name) <- "UTF-8"
		res.con <- .strtrim(res.con)
		res.forward <- .strtrim(res.forward)
		res.date <- strptime(res.date, format = "%Y-%m-%d %H:%M")
		res.stat.f <- as.numeric(gsub("[^0-9]", "", sapply(res.stat, FUN = function(X) X[grep("\u8F6C\u53D1", X)])))
		res.stat.r <- as.numeric(gsub("[^0-9]", "", sapply(res.stat, FUN = function(X) X[grep("\u8BC4\u8BBA", X)])))
		res.stat.f[is.na(res.stat.f)] <- 0
		res.stat.r[is.na(res.stat.r)] <- 0
		
		OUT <- data.frame(MID = res.mid, Author = res.name, Weibo = res.con, Forward = res.forward, Time_Weibo = res.date,
				Time_Search = Sys.time(), Count_Forward = res.stat.f, Count_Reply = res.stat.r, stringsAsFactors = FALSE)
		OUT$Weibo <- sapply(seq_along(OUT$Weibo), FUN = function(X) 
					gsub(paste("^ *", OUT$Author[X], "\uFF1A *", sep = ""), "", OUT$Weibo[X]))
	} else {
		OUT <- NULL
	}
	
	return(OUT)
}
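
## Toy illustration (hypothetical input) of how the JSON payload is stripped out of
## the "STK.pageletM.view(...)" wrapper above; the html field is abbreviated here.
# x <- 'STK.pageletM.view({"pid":"pl_weibo_direct","html":"<dl>...</dl>"})'
# gsub("\\)$", "", gsub("^.*STK.pageletM.view\\(", "", x))
# ## [1] "{\"pid\":\"pl_weibo_direct\",\"html\":\"<dl>...</dl>\"}"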


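# Authenticated search: scrape one page of s.weibo.com search results via the RCurl handle of an OAuth object.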
.search.content.curl <- function(sword, page = 1, curl = NULL, ...) {
	requestURL <- "http://s.weibo.com/weibo/"
	sword <- curlEscape(.cntoUTF8(sword))
	strurl <- paste(requestURL, sword, "&xsort=time&page=", page, sep = "") # time sorting 
	
	resXML <- getURL(strurl, curl = curl, .encoding = 'UTF-8')
	resHTMLs <- .strextract(resXML, "<script>.+?</script>")[[1]]
	resHTML <- resHTMLs[grep("\"pid\":\"pl_weibo_feedlist\"", resHTMLs)][1]
	if (is.na(resHTML)) {
		warning("Can not crawl any page now. May be forbidden by Sina temporarily.", call. = FALSE)
		return(NULL)
	}

	weibojson <- gsub("\\)</script>$", "", gsub("^.*STK.pageletM.view\\(", "", resHTML))
	weibolist <- .fromJSON(weibojson)
	
	weibopage <- htmlParse(weibolist[["html"]], asText=TRUE, encoding = "UTF-8")
	
	weiboitem.attr <- getNodeSet(weibopage, "//dl[@class='feed_list']")
	weiboitem.con <- getNodeSet(weibopage, "//dd[@class='content']")
	weiboitem.nores <- getNodeSet(weibopage, "//div[@class='pl_noresult']")
	
	if (length(weiboitem.nores) == 0) {
		res.mid <- sapply(weiboitem.attr, function(X) xmlGetAttr(X, "mid"))
		res.con <- sapply(weiboitem.con, FUN = function(X) xmlValue(getNodeSet(X, "p[@node-type='feed_list_content']")[[1]]))	
		res.name <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p[@node-type='feed_list_content']/a")[[1]], "nick-name"))
		res.date <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p/a[@node-type='feed_list_item_date']")[[1]], "title"))
		res.stat <- lapply(weiboitem.con, FUN = function(X) sapply(getNodeSet(X, "p/span/a"), xmlValue))
		res.forward <- sapply(weiboitem.con, FUN = function(X) {
					tmp.node <- getNodeSet(X, "dl/dt[@node-type='feed_list_forwardContent']")
					if (length(tmp.node) == 0) {
						NA
					} else {
						xmlValue(tmp.node[[1]])
					}
				}
		)
		Encoding(res.name) <- "UTF-8"			
		res.con <- .strtrim(res.con)
		res.forward <- .strtrim(res.forward)
		res.date <- strptime(res.date, format = "%Y-%m-%d %H:%M")
		res.stat.f <- as.numeric(gsub("[^0-9]", "", sapply(res.stat, FUN = function(X) X[grep("\u8F6C\u53D1", X)])))
		res.stat.r <- as.numeric(gsub("[^0-9]", "", sapply(res.stat, FUN = function(X) X[grep("\u8BC4\u8BBA", X)])))
		res.stat.f[is.na(res.stat.f)] <- 0
		res.stat.r[is.na(res.stat.r)] <- 0
		
		OUT <- data.frame(MID = res.mid, Author = res.name, Weibo = res.con, Forward = res.forward, Time_Weibo = res.date,
				Time_Search = Sys.time(), Count_Forward = res.stat.f, Count_Reply = res.stat.r, stringsAsFactors = FALSE)
		OUT$Weibo <- sapply(seq_along(OUT$Weibo), FUN = function(X) 
					gsub(paste("^ *", OUT$Author[X], "\uFF1A *", sep = ""), "", OUT$Weibo[X]))
	} else {
		OUT <- NULL
	}
	
	return(OUT)
}
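
## Internal usage sketch (not exported): web.search.content() dispatches here when an
## OAuth object is supplied, passing its RCurl handle, e.g.
# roauth <- createOAuth(...)   # see ?createOAuth for the required arguments
# .search.content.curl("Rweibo", page = 1, curl = roauth$webCurl)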
