R/xsitemapGet.R

#' xsitemapGet
#'
#' Fetch an XML sitemap and return the URLs it lists as a data frame. When
#' only a hostname is supplied, the sitemap location is discovered via
#' robots.txt or by guessing common paths. Sitemap index files are crawled
#' recursively.
#'
#' @param urltocheck direct XML sitemap URL, or the hostname of the website
#'   whose XML sitemap should be found
#' @param user_agent user agent string (defaults to a desktop Chrome user agent)
#'
#' @return a data frame with columns \code{loc} and \code{lastmod} (plus
#'   \code{origin} when a sitemap index was crawled); \code{NULL} or
#'   \code{FALSE} on failure
#' @importFrom urltools url_parse
#' @importFrom httr GET user_agent content
#' @importFrom XML xmlParse xmlToList NOCDATA
#' @export
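#' @examples
#' \dontrun{
#' # Illustrative only; example.com is a placeholder domain
#' # Give a hostname and let the function discover the sitemap:
#' urls <- xsitemapGet("https://www.example.com")
#'
#' # Or point it at a known sitemap URL directly:
#' urls <- xsitemapGet("https://www.example.com/sitemap.xml")
#' }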
#'
xsitemapGet <- function(urltocheck,
                        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36") {

  URL <- url_parse(urltocheck)

  # No path: only a hostname was supplied, so the sitemap URL has to be
  # discovered, first via robots.txt, then by guessing common locations
  if (is.na(URL$path) && !is.na(URL$domain)) {
    message("Searching for XML Sitemap URL...")

    # Reaching for robots.txt to find the XML sitemap
    xmlsitemap_from_robots <- xsitemapGetFromRobotsTxt(urltocheck, user_agent)

    if (xmlsitemap_from_robots != "") {
      return(xsitemapGet(xmlsitemap_from_robots, user_agent))
    } else {
      # robots.txt didn't declare a sitemap, so try common locations
      xmlsitemap_from_guessing <- xsitemapGuess(urltocheck, user_agent)
      if (!is.null(xmlsitemap_from_guessing)) {
        return(xsitemapGet(xmlsitemap_from_guessing, user_agent))
      } else {
        warning(paste("Can't find XML sitemap URL for", urltocheck))
        return(FALSE)
      }
    }

  } else if (!is.na(URL$path)) {
    message(paste("Reaching for XML sitemap...", urltocheck))


    request <- GET(urltocheck, user_agent(user_agent))

    if (request$status_code != 200 && request$status_code != 301) {
      warning(paste(
        "xml sitemap is not accessible (HTTP:",
        request$status_code
      ))
      return(NULL)
    }



    xml_doc <- xmlParse(request, encoding = "UTF-8", options = NOCDATA)





    xml_data <- xmlToList(xml_doc)
    
    # xmlToList() returns a list for any non-empty sitemap; anything else
    # means the document held no usable entries
    if (!is.list(xml_data)) {
      warning("empty sitemap")
      return(NULL)
    }

    nb_children <- length(xml_data)

    # A <sitemapindex> document lists its child sitemaps under <sitemap><loc>
    if (!is.null(xml_data$sitemap$loc)) {
      # The sitemap protocol allows at most 50,000 entries per file
      if (nb_children < 50001) {
        message(paste(
          "sitemap index detected - ",
          nb_children,
          " sitemap url(s) found"
        ))
      } else {
        warning(paste("too many URLs - ",
                      nb_children,
                      " sitemap url(s) found"))
        return(NULL)
      }

      # origin records which child sitemap each URL came from; declaring it
      # here keeps the columns aligned for rbind() below
      urls <- data.frame(
        loc = character(),
        lastmod = as.Date(character(), format = "%Y-%m-%d"),
        origin = character(),
        stringsAsFactors = FALSE
      )

      for (i in seq_len(nb_children)) {
        individual_sitemap <- xml_data[i]$sitemap$loc
        if (!is.null(individual_sitemap)) {
          message(paste0("\n", i, " >>> ", individual_sitemap))
          # Recurse into the child sitemap, forwarding the user agent
          new_urls <- xsitemapGet(individual_sitemap, user_agent)

          # Skip children that could not be fetched
          if (!is.null(new_urls)) {
            # Record which child sitemap each URL came from
            parsed_urls <- urltools::url_parse(individual_sitemap)
            if (!is.na(parsed_urls$parameter)) {
              new_urls$origin <-
                paste0(parsed_urls$path, "?", parsed_urls$parameter)
            } else {
              new_urls$origin <- parsed_urls$path
            }

            urls <- rbind(urls, new_urls)
          }
        }
      }
      rownames(urls) <- NULL
      return(urls)

    } else {
      # A regular <urlset> sitemap: page URLs live under <url><loc>. The
      # last list element is typically the .attrs entry xmlToList() creates
      # for the xmlns attribute, hence the nb_children - 1 below.
      message(paste(
        "regular sitemap detected - ",
        nb_children - 1,
        " web page url(s) found"
      ))

      urls <- data.frame(
        loc = character(),
        lastmod = as.Date(character(), format = "%Y-%m-%d"),
        stringsAsFactors = FALSE
      )


      # One progress dot per URL; seq_len() copes with an empty sitemap,
      # and the trailing .attrs element is skipped
      for (i in seq_len(nb_children - 1)) {
        cat(".")
        urls[i, "loc"] <- xml_data[i]$url$loc
        if (!is.null(xml_data[i]$url$lastmod)) {
          urls[i, "lastmod"] <- xml_data[i]$url$lastmod
        }
      }

      rownames(urls) <- NULL
      return(urls)
    }

  } else {
    stop(paste("Malformed URL:", urltocheck))
  }

}
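
# The two helpers called above, xsitemapGetFromRobotsTxt() and xsitemapGuess(),
# are defined elsewhere in the package. The sketches below are illustrative
# assumptions about how such helpers could work, inferred only from how they
# are called here; they are not the package's actual implementations, and the
# "_sketch" names are hypothetical.

# Minimal sketch: read robots.txt and return the first declared sitemap URL,
# or "" when none is found (matching the `!= ""` check above)
xsitemapGetFromRobotsTxt_sketch <- function(urltocheck, user_agent) {
  domain <- urltools::url_parse(urltocheck)$domain
  robots_url <- paste0("https://", domain, "/robots.txt")
  request <- httr::GET(robots_url, httr::user_agent(user_agent))
  if (httr::status_code(request) != 200) return("")
  body <- httr::content(request, as = "text", encoding = "UTF-8")
  lines <- strsplit(body, "\r?\n")[[1]]
  sitemap_lines <- grep("^sitemap:", lines, ignore.case = TRUE, value = TRUE)
  if (length(sitemap_lines) == 0) return("")
  # Strip the "Sitemap:" prefix and surrounding whitespace
  trimws(sub("^sitemap:\\s*", "", sitemap_lines[1], ignore.case = TRUE))
}

# Minimal sketch: probe common sitemap locations and return the first URL
# answering HTTP 200, or NULL (matching the is.null() check above)
xsitemapGuess_sketch <- function(urltocheck, user_agent) {
  domain <- urltools::url_parse(urltocheck)$domain
  candidates <- paste0("https://", domain,
                       c("/sitemap.xml", "/sitemap_index.xml", "/sitemap-index.xml"))
  for (candidate in candidates) {
    request <- httr::GET(candidate, httr::user_agent(user_agent))
    if (httr::status_code(request) == 200) return(candidate)
  }
  NULL
}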