R/getTitles.R

Defines functions getTitles

Documented in getTitles

#' @title Get url of Specific Story to be Scraped
#'
#' @description
#' \code{getTitles} is a helper function of getChapter. It goes through the
#' entry titles of a fandom listing in search of the title queried by the user.
#'
#' @details
#' Entries are taken from the first link of the 4th to 28th \code{div} inside
#' the element with id \code{content_wrapper_inner}. Each listing page is
#' downloaded and parsed once, and the search stops at the first page that
#' contains an entry whose link text is exactly equal to the title.
#'
#' @param returns A list generated by \code{getPagesTitle.absentfan}, read by
#'   position: (1) noPages (bool, \code{TRUE} when the listing is a single page
#'   and only \code{url} has to be scraped), (2) page_use (href prefix of the
#'   numbered listing pages, if needed), (3) page_num (total number of pages to
#'   iterate), (4) url, and (5) story title.
#'
#' @return A list including \code{url} (the base url of fanfiction.net) and
#'   \code{href}, the href of the story being queried. \code{href} is
#'   \code{NULL} when no entry matches the title.
#' @examples
#' \dontrun{
#' getTitles(list(
#'   noPages = FALSE,
#'   page_use = "/book/Harry-Potter/?&srt=1&r=103&p=",
#'   page_num = 11,
#'   url = "https://www.fanfiction.net/book/Harry-Potter/",
#'   title = "Modern Marauders"
#' ))
#' }
getTitles <- function(returns) {
  # Elements are read by position so that unnamed lists keep working.
  noPages <- returns[[1]]
  page_use <- returns[[2]]
  page_num <- returns[[3]]
  url <- returns[[4]]
  title <- returns[[5]]

  base_url <- "https://www.fanfiction.net"

  # Return the href of the first entry on `page_url` whose link text equals
  # `title`, or NULL when the page holds no such entry.
  scrapeEntries <- function(page_url) {
    # One query selects the first link of div 4 to 28, so the page is fetched
    # a single time and divs without a link are skipped instead of producing
    # a zero-length comparison.
    entry_xpath <- paste0(
      '//*[@id="content_wrapper_inner"]',
      "/div[position() >= 4 and position() <= 28]/a[1]"
    )
    links <- html_nodes(read_html(page_url), xpath = entry_xpath)
    hits <- which(html_text(links) == title)
    if (length(hits) == 0) {
      return(NULL)
    }
    html_attr(links, "href")[[hits[1]]]
  }

  href <- NULL
  if (isTRUE(as.logical(noPages))) {
    href <- scrapeEntries(url)
  } else {
    # page_use is an href and usually starts with "/"; drop it so the joined
    # url does not contain "//" after the host.
    page_path <- sub("^/+", "", page_use)
    for (i in seq_len(as.integer(page_num))) {
      href <- scrapeEntries(paste0(base_url, "/", page_path, i))
      if (!is.null(href)) {
        break
      }
    }
  }

  list(url = base_url, href = href)
}
ekmaus19/absentfan documentation built on Nov. 20, 2019, 3:20 a.m.