R/R/getMainPagesContent.R

Defines functions getMainPagesContent

#' Collect the content of the paginated main review pages
#'
#' Downloads `firstPage`, then walks the follow-up pages whose URLs are built
#' by inserting `-or<offset>-` after the `Reviews` part of `firstPage`,
#' advancing the offset by 5 per page. Every page is handed to
#' `getMPContent()` (sourced from "functions/getMPContent.R") and the
#' resulting tables are row-bound together. Finally `ratingDate` is parsed
#' as month/day/year and split into `year`, `month` and `day` columns.
#'
#' @param name Not used inside this function; kept for interface
#'   compatibility.
#' @param firstPage URL of the first page; it must contain the text
#'   "Reviews-", which is used as the split point to build later URLs.
#' @param input List-like object providing `start` (offset of the first
#'   follow-up page) and `pageNbr` (upper bound for the offset).
#' @param src Not used inside this function; kept for interface
#'   compatibility.
#' @return The combined output of `getMPContent()` for all retrieved pages,
#'   with `ratingDate` reformatted by `date.ddmmmyy()` and additional
#'   `year`, `month` and `day` columns.
getMainPagesContent <- function(name, firstPage, input, src="TripAdvisor"){
  # The packages are attached (not namespace-qualified) on purpose: the
  # sourced getMPContent() presumably relies on them being on the search
  # path -- TODO confirm. library() fails loudly where require() would only
  # return FALSE.
  library(XML)
  library(httr)
  library(dplyr)

  # Retry policy for follow-up pages (same values as before).
  maxRetries <- 3
  retryDelay <- 60

  # Download and parse one page; returns NULL instead of raising on failure
  # so that the retry loop below can test the result safely.
  fetchPage <- function(url) {
    tryCatch(
      htmlTreeParse(rawToChar(GET(url)$content), useInternalNodes = TRUE),
      error = function(e) NULL
    )
  }

  urlParts <- strsplit(firstPage, "Reviews-")[[1]]
  preURL <- paste0(urlParts[1], "Reviews")
  postURL <- urlParts[2]

  print("Retrieve first Page")
  # A failure on the very first page is not recoverable, so it is allowed
  # to propagate as an error.
  pageContent <- htmlTreeParse(rawToChar(GET(firstPage)$content), useInternalNodes = TRUE)

  # Read the advertised number of reviews and round it up to a multiple of
  # 5 (the offset step). Only the first matching link is used, and every
  # non-digit is dropped so that labels such as "1,234 Reviews" still parse.
  reviewNodes <- getNodeSet(pageContent, "//div[@class='rs rating']//a[@class='more taLnk']")
  reviewLabels <- vapply(reviewNodes, function(x) xmlValue(x), character(1))
  reviewNbr <- as.numeric(gsub("[^0-9]", "", reviewLabels[1]))
  reviewNbr <- ceiling(reviewNbr / 5) * 5

  # If the review count cannot be determined, fall back to the requested
  # limit; the loop still stops at the first page without usable rows.
  if (is.na(reviewNbr)) {
    endpage <- input$pageNbr
  } else {
    endpage <- min(input$pageNbr, reviewNbr)
  }

  source("functions/getMPContent.R")
  pages <- list(getMPContent(pageContent))

  i <- input$start
  while (i < endpage) {
    print(paste("Retrieve Main Page ", i))
    nextURL <- paste0(preURL, "-or", i, "-", postURL)

    pageContent <- fetchPage(nextURL)
    attempt <- 1
    while (is.null(pageContent) && attempt <= maxRetries) {
      print(paste0("attempt ", attempt))
      attempt <- attempt + 1
      Sys.sleep(retryDelay)
      pageContent <- fetchPage(nextURL)
    }

    if (is.null(pageContent)) {
      # Keep what was gathered so far instead of failing the whole run.
      warning("Could not retrieve ", nextURL, " after ", maxRetries,
              " retries; returning the pages collected so far.",
              call. = FALSE)
      break
    }

    tableContent <- getMPContent(pageContent)
    # A missing id in the first row is treated as "no more reviews".
    if (is.na(tableContent[1, "id"])) {
      break
    }
    pages[[length(pages) + 1]] <- tableContent
    i <- i + 5

    # Pause 1-2 seconds between requests. The previous
    # floor(runif(1, 0, 1)) was always 0, so no jitter was ever applied.
    Sys.sleep(1 + runif(1, min = 0, max = 1))
  }

  # Bind once at the end rather than growing the table inside the loop.
  mainPagesContent <- do.call(rbind, pages)

  library(date)
  parsedDate <- as.date(as.character(mainPagesContent$ratingDate), order = "mdy")
  # date.mdy() returns its components in the order month, day, year.
  dateParts <- date.mdy(parsedDate)
  mainPagesContent$year <- dateParts[[3]]
  mainPagesContent$month <- dateParts[[1]]
  mainPagesContent$day <- dateParts[[2]]
  mainPagesContent$ratingDate <- date.ddmmmyy(parsedDate)

  mainPagesContent
}
NicolasJBM/tripapp documentation built on Nov. 18, 2017, 8:49 a.m.