# R/selenium_fun.R

# Defines functions remDr_go_to_link get_page_source do_remDr connect_remDr start_chrome_remDr

# Documented in connect_remDr do_remDr get_page_source remDr_go_to_link start_chrome_remDr

# Start remote driver with docker ---------------------------

#' Start remote driver with docker using the system command line.
#'
#' Lists the running containers with \code{docker ps}. Every running
#' \code{selenium/standalone-chrome} container found there is killed, and a
#' fresh detached container is then started with host port 4445 mapped to
#' container port 4444.
#'
#' @param kill Boolean indicating whether a running docker container with
#' selenium should be killed. A running selenium container is always replaced,
#' whatever the value of \code{kill}; the argument is kept so that existing
#' calls keep working. When no selenium container is running nothing is
#' killed.
#' @return Called for its side effects; returns \code{NULL} invisibly.
#' @import RSelenium
#' @importFrom utils capture.output

start_chrome_remDr <- function(kill = FALSE) {
  image <- "selenium/standalone-chrome"

  ps_out <- system("docker ps", intern = TRUE)

  # Rows of the `docker ps` table that belong to a selenium container. This is
  # empty (rather than an error) when docker printed nothing or when no such
  # container is running.
  selenium_cont <- ps_out[stringr::str_detect(ps_out, stringr::fixed(image))]

  if (length(selenium_cont) > 0) {
    cat("selenium/standalone-chrome docker container already runnign \n")
    cat("kill container and restart a new one \n")

    # The container id is the first whitespace-delimited field of each row.
    docker_id <- stringr::str_extract(selenium_cont, "^[^[:space:]]+")
    docker_id <- docker_id[!is.na(docker_id)]

    if (length(docker_id) > 0) {
      # Kill every matching container in one call, not only the first one.
      system(stringr::str_c(
        "docker kill ",
        stringr::str_c(docker_id, collapse = " ")))
      Sys.sleep(2)
    }
  }

  cat("starting selenium/standalone-chrome docker container \n")
  status <- system(
    "docker run -d -p 4445:4444 --shm-size 2g selenium/standalone-chrome")
  if (!identical(as.integer(status), 0L)) {
    warning(
      "'docker run' exited with status ", status,
      "; the selenium container may not be running.",
      call. = FALSE)
  }
  # Give the container a moment to boot before a client connects.
  Sys.sleep(2)

  invisible(NULL)
}

# connect to remote driver ---------------------------

#' Connect to remote driver
#'
#' This function assumes a docker container was previously started using
#' \code{\link{start_chrome_remDr}}. It builds the remote driver object that is
#' used to interact with websites. The object is only created here; the
#' session itself is opened later with its \code{open()} method.
#'
#' @param remoteServerAddr Address of the selenium server. Defaults to
#' \code{"localhost"}.
#' @param port Port of the selenium server. Defaults to \code{4445L}, the host
#' port published by \code{\link{start_chrome_remDr}}.
#' @param browserName Name of the browser to drive. Defaults to
#' \code{"chrome"}.
#' @return Object class remote driver.
#' @import RSelenium

connect_remDr <- function(remoteServerAddr = "localhost",
                          port = 4445L,
                          browserName = "chrome") {
  RSelenium::remoteDriver(
    remoteServerAddr = remoteServerAddr,
    port = port,
    browserName = browserName)
}

# Wrapper function to interact with remote driver ---------------------------

#' Wrapper function to interact with remote driver
#'
#' Wrapper function to do operations with the remote driver. Allows the use of
#' a user-defined function for a specific link. The idea is to make the process
#' of remotely interacting with websites as flexible as possible, so that any
#' function of interest can be passed to scrape or do operations on the link
#' specified.
#'
#' The wrapper opens the session, sets an implicit timeout of 5000
#' milliseconds, navigates to \code{link}, pauses, calls
#' \code{FUN(FUN_input)} and closes the session. The session is closed even
#' when navigation or \code{FUN} raises an error.
#'
#' @param remDr remote driver connection object created with
#' \code{\link{connect_remDr}}.
#' @param link url for the website to interact with.
#' @param FUN user-specified function, or the name of one.
#' @param FUN_input input that will go in \code{FUN}.
#' @param wait seconds to pause after navigating and before calling
#' \code{FUN}. Defaults to 2.
#' @return Object generated by \code{FUN}.
#'
#' @import RSelenium

do_remDr <- function(remDr, link, FUN, FUN_input, wait = 2) {
  # Resolve FUN up front so a bad argument fails before a session is opened.
  FUN <- match.fun(FUN)

  # open server connection
  remDr$open(silent = TRUE)
  # Registered right after a successful open so the session is released on
  # every exit path, including errors raised by navigate() or FUN.
  on.exit(remDr$close(), add = TRUE)

  # set a timeout
  remDr$setTimeout(type = "Implicit", milliseconds = 5000)

  # navigate to the website
  remDr$navigate(url = link)
  Sys.sleep(wait)

  # function to do something on the url
  FUN(FUN_input)
}


# Get page source ---------------

#' Get html page source
#'
#' Downloads and parses the source of the page currently loaded in the remote
#' driver. The function sets an implicit timeout of 5000 milliseconds, pauses
#' for 3 seconds, fetches the page source and parses it with
#' \code{xml2::read_html}. It does not open the session and does not navigate:
#' both calls are disabled in the body, so \code{link} is currently unused and
#' the driver is presumably expected to already be on the desired page (e.g.
#' via \code{\link{remDr_go_to_link}}).
#'
#' @param remDr remote driver connection
#' @param link link to the desired webpage (currently not used by the body)
#'
#' @return parsed html file.

get_page_source <- function(remDr, link) {
  # Implicit timeout applied to the driver session.
  remDr$setTimeout(type = "Implicit", milliseconds = 5000)

  # Fixed pause before the source is read.
  Sys.sleep(3)

  # getPageSource() returns a list; its first element holds the markup.
  page_markup <- remDr$getPageSource()
  xml2::read_html(page_markup[[1]])
}

# Connect to Link -----

#' Connect remote driver to specified url
#'
#' Sets an implicit timeout of 5000 milliseconds on the driver and then
#' navigates it to \code{link}. The session is not opened here, so
#' \code{remDr} must already be open.
#'
#' @param remDr remote driver connection
#' @param link link to the desired webpage
#'
#' @return Whatever \code{remDr$navigate()} returns; the function is called
#' for its side effect.
#'
#' @import RSelenium

remDr_go_to_link <- function(remDr, link) {
  # Implicit timeout first, then move the browser to the requested page.
  remDr$setTimeout(type = "Implicit", milliseconds = 5000)
  remDr$navigate(url = link)
}
# open server connection
# tijoalca/pigeonscraper documentation built on Sept. 2, 2021, 9:48 a.m.