R/screamingfrog_crawlVsSitemap.R

Defines functions screamingfrog_crawlVsSitemap

Documented in screamingfrog_crawlVsSitemap

#' Function to compare the crawled URLs with the URLs in the sitemap.xml
#'
#' This function allows to compare the crawled URLs with the URLs in the sitemap.xml. The Output are the missing URLs - in the Sitemap or in the Crawl
#' @param crawl The Path to the exportet Screaming Frog csv-File
#' @param sitemap The Sitemap you want to compare with the Crawl
#' @param deltaIn The Delta you want to analyze. Do you want to get the URLs in the Crawl missing in the Sitemap = "crawl" or the URLs in the Sitemap.xml not found in the Crawl = "sitemap"
#' @param checkImages Logical. Do you want to check the Images as well. Default is FALSE.
#' screamingfrog_crawlVsSitemap()



screamingfrog_crawlVsSitemap <-
  function(crawl,
           sitemap,
           deltaIn = "sitemap",
           checkImages = FALSE) {
    #Import the Screaming Frog Crawl data
    sc_crawl <- readr::read_csv(crawl, skip = 1)

    crawled_pages <-
      as.data.frame(cbind(sc_crawl$Address, sc_crawl$Content)) # Just HTML-Pages
    colnames(crawled_pages)[1] <- "loc"
    #Just keep HTML
    if (checkImages == FALSE) {
      #Just keep HTML
      V2<-NULL;
      crawled_pages <-
        subset(crawled_pages, V2 == "text/html; charset=UTF-8")
    } else {
      #Keep HTML and Images
      crawled_pages <-
        subset(
          crawled_pages,
          V2 == "text/html; charset=UTF-8" |
            V2 == "image/png" | V2 == "image/jpeg"  | V2 == "image/gif"
        )
    }

    #Download the Sitemap.xml
    sm <- downloadSitemap(sitemap)
    sm <- as.data.frame(sm$loc)
    colnames(sm)[1] <- "loc"

    if (deltaIn == "sitemap") {
      res <- dplyr::anti_join(sm, crawled_pages, by = "loc")
      colnames(res)[1] <- "In_Sitemap_not_in_Crawl"
    } else {
      res <- dplyr::anti_join(crawled_pages, sm, by = "loc")
      res <- as.data.frame(res$loc)
      colnames(res)[1] <- "In_Crawl_not_in_Sitemap"
    }
    return(res)

  }
dschmeh/seoR documentation built on Jan. 7, 2023, 12:19 a.m.