# R/update_facebook_pages.R

#' Get additional information for pages
#'
#' @param page A page ID.
#' @param token Either a temporary access token created at
#'   https://developers.facebook.com/tools/explorer or the OAuth token created
#'   with fbOAuth.
#' @param fields The API fields that are requested. "fan_count" is the default.
#' @param api The API version to use, e.g., "v2.8". NULL is the default.
#'
#' @return A named list where the page id and the additional fields are stored.
#' @export
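#'
#' @examples
#' \dontrun{
#' # A minimal sketch: look up the fan count of a page. Assumes fb_oauth is a
#' # valid token created with Rfacebook::fbOAuth.
#' info <- get_pageinfo("bbcnews", token = fb_oauth)
#' info$fan_count
#' }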
get_pageinfo <- function(page, token, fields = "fan_count", api = NULL) {
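  # Builds a Graph API request of the form
  # https://graph.facebook.com/<page>?fields=<fields> and delegates the
  # authenticated call to Rfacebook::callAPI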
  url <- paste0('https://graph.facebook.com/', page, '?fields=',
                paste(fields, collapse = ","))
  Rfacebook::callAPI(url = url, token = token, api = api)
}


#' Update or get posts from a Facebook page
#'
#' This function updates one Facebook page with \link{Rfacebook}'s getPage. It
#' looks for a previously stored data.frame (readable by readRDS, e.g.,
#' bbcnews.rds). If one exists, it tries to scrape posts that are newer (and
#' older) than the ones already stored in the data.frame. Otherwise it creates
#' a new data.frame.
#'
#' @param page A page name or ID.
#' @param token Either a temporary access token created at
#'   https://developers.facebook.com/tools/explorer or the OAuth token created
#'   with fbOAuth.
#' @param datafile The full path to an RDS data file.
#' @param go_back Go back in time, not only update newer posts, but also get
#'   older ones. Default is TRUE.
#' @param n_posts The number of posts to scrape from the page, defaults to 100.
#' @param feed If TRUE, the function will also return posts on the page that
#'   were made by others (not only the admin of the page).
#' @param reactions If TRUE, will add variables to the data frame with the total
#'   count of reactions: love, haha, wow, sad, angry.
#' @param max_repeats The number of repeated calls to scrape data backwards. If
#'   go_back is TRUE, there will be repeated calls to go back in time (each call
#'   gets the number of posts in n_posts). The backward scraping will stop after
#'   max_repeats is reached. Defaults to 100.
#' @param debug If TRUE, more information will be printed.
#'
#' @return TRUE if the update ran through, FALSE otherwise.
#' @importFrom rlang .data
#' @export
#'
#' @examples
#' \dontrun{
#' # You have to authenticate with Facebook's API first. For more on this, see
#' # Rfacebook's documentation.
#' fb_oauth <- Rfacebook::fbOAuth(app_id="123456789",
#'                                app_secret="1A2B3C4D",
#'                                extended_permissions = TRUE)
#'
#' # The BBC news page's posts are updated and stored in the user's home
#' # folder in the directory "temp" as "bbc.rds". If the file does not already
#' # exist, a new file is created.
#' update_page("bbcnews", fb_oauth, "~/temp/bbc.rds")
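#'
#' # A variant with illustrative parameter values: also collect the total
#' # reaction counts and scrape backwards in batches of 200 posts
#' update_page("bbcnews", fb_oauth, "~/temp/bbc.rds",
#'             n_posts = 200, reactions = TRUE, go_back = TRUE)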
#'
#' # The data set can be loaded with readRDS
#' readRDS("~/temp/bbc.rds")
#' }
update_page <- function(page, token, datafile, go_back = TRUE,
                        n_posts = 100, feed = FALSE, reactions = FALSE,
                        max_repeats = 100, debug = FALSE) {

  message(paste0("### Updating Facebook posts for ", page, "."))
  existing_posts <- 0

  # Check parameters
  if (missing(page)) {
    stop("Parameter page is missing, please provide a Facebook page ID or name.")
  }
  if (missing(datafile)) {
    stop("Parameter datafile is missing, please provide a file path.")
  }
  if (missing(token)) {
    stop("Parameter token is missing, please provide a Facebook auth token.")
  }

  # Run for the first time or get new posts until the previously newest post
  # is reached (or initialize)
  if (file.exists(datafile)) {

    message(paste0(datafile, " found. Updating data ..."))
    old_data <- readRDS(datafile)
    existing_posts <- nrow(old_data)

    # If scrape_time (data from old versions) does not exist, add an NA column
    # (assigning a zero-length vector would fail for non-empty data frames)
    if (!any(colnames(old_data) == "scrape_time")) {
      old_data[, "scrape_time"] <- as.POSIXct(NA)
    }

    # If fan_count (data from old versions) does not exist, add an NA column
    if (!any(colnames(old_data) == "fan_count")) {
      old_data[, "fan_count"] <- NA_real_
    }

    # If reactions = TRUE but the reaction columns do not yet exist, add them
    # as NA columns
    if (reactions) {
      if (!any(colnames(old_data) == "love_count")) {
        old_data[, "love_count"] <- NA_real_
      }
      if (!any(colnames(old_data) == "haha_count")) {
        old_data[, "haha_count"] <- NA_real_
      }
      if (!any(colnames(old_data) == "wow_count")) {
        old_data[, "wow_count"] <- NA_real_
      }
      if (!any(colnames(old_data) == "sad_count")) {
        old_data[, "sad_count"] <- NA_real_
      }
      if (!any(colnames(old_data) == "angry_count")) {
        old_data[, "angry_count"] <- NA_real_
      }
    }

    # Reorder variables if they deviate from the expected order
    var_order <- current_facebook_sort()
    if (!identical(names(old_data), var_order)) {
      old_data <- sort_data(data = old_data)
    }
    # # oldest post
    # oldest <- substr(min(old_data$created_time), 1, 10)
    # # newest post
    # newest <- substr(max(old_data$created_time), 1, 10)
    # # Issue 158: https://github.com/pablobarbera/Rfacebook/issues/158
    # #newest <- as.numeric(lubridate::ymd_hms(max(page_data$created_time)))

    data <- tryCatch(
      {
        Rfacebook::getPage(page = page, token = token, n = n_posts,
                           reactions = reactions, feed = feed)
      # }, warning = function(w) {
      #   message(paste0("A warning occured while calling getPage: ", w))
      #   NULL # Return NULL
      }, error = function(e) {
        message(paste0("An error occurred while calling getPage: ", e))
        NULL # Return NULL
      })
    if (is.null(data)) { return(FALSE) } # Stop function and return FALSE
    if (nrow(data) < 1) {
      message("No posts downloadable.")
      return(FALSE)
    }
    # If reactions is TRUE, Rfacebook sorts the data in ascending order (oldest
    # post first), so the downloaded data is rearranged here:
    data <- dplyr::arrange(
      data,
      dplyr::desc(.data$created_time),
      dplyr::desc(.data$id)
    )
    data[, "scrape_time"] <- Sys.time()
    data[, "fan_count"] <-
      get_pageinfo(page = page, token = token)[["fan_count"]]

    data <- dplyr::arrange(
        data,
        dplyr::desc(.data$created_time),
        dplyr::desc(.data$scrape_time),
        .data$id
    )

    if (debug) {
      message(paste0("DEBUG: Newest post from pre-existing data: ",
                     max(old_data$created_time)))
    }
    if (debug) {
      message(paste0("DEBUG: Oldest post from new data: ",
                     min(data$created_time)))
    }

    # Hash of the data from the first call; used below to detect when two
    # consecutive calls return identical results.
    hash <- digest::digest(data, algo = "md5")
    # Get new data until newest date from pre-existing data is reached
    repeat_counter <- 0
    while (min(data$created_time) > max(old_data$created_time)) {

      message(paste0("Newer data found for ", page, ". Downloading ", n_posts,
                     " new posts."))

      oldest_in_data <- substr(min(data$created_time), 1, 10)
      oldest_in_data <- lubridate::date(oldest_in_data) + 1
      oldest_in_data <- as.character(oldest_in_data)
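      # One day was added above because the timestamp is truncated to a date;
      # without it, the until boundary could cut off posts from the oldest day.
      # Overlap duplicates are removed later with dplyr::distinct().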

      if (debug) {
        message(paste0("DEBUG: Oldest date for updating new posts: ", oldest_in_data))
      }

      new_data <- tryCatch(
        {
          Rfacebook::getPage(page, token = token, n = n_posts,
                             reactions = reactions, feed = feed,
                             until = oldest_in_data)
        # }, warning = function(w) {
        #   message(paste0("A warning occured while calling getPage: ", w))
        #   NULL # Return NULL
        }, error = function(e) {
          message(paste0("An error occurred while calling getPage: ", e))
          NULL # Return NULL
        })
      if (is.null(new_data)) {
        message("No downloadable data found while trying to get more new data.")
        break
      } # Stop loop
      if (nrow(new_data) < 1) {
        message("No more new posts downloadable.")
        break
      }
      # If reactions is TRUE, Rfacebook sorts the data in ascending order
      # (oldest post first), so the downloaded data is rearranged here:
      new_data <- dplyr::arrange(
        new_data,
        dplyr::desc(.data$created_time),
        dplyr::desc(.data$id)
      )
      new_data[, "scrape_time"] <- Sys.time()
      new_data[, "fan_count"] <-
        get_pageinfo(page = page, token = token)[["fan_count"]]

      repeat_counter <- repeat_counter + 1
      # Debug messages
      if (debug) {
        message(paste0("DEBUG: Loop for getting older data ran for ",
                       repeat_counter, " time(s)."))
        message(paste0("DEBUG: Previously oldest ID is ",
                       data[nrow(data), c("id")],
                       "."))
        message(paste0("DEBUG: Previously oldest post is from ",
                       min(data$created_time), "."))
        message(paste0("DEBUG: Now downloaded oldest ID is ",
                       new_data[nrow(new_data), c("id")], "."))
        message(paste0("DEBUG: Now downloaded oldest post is from ",
                       min(new_data$created_time), "."))
      }

      # hash from previous run is set as previous
      previous_hash <- hash
      # hash from this run is computed
      hash <- digest::digest(new_data, algo = "md5")

      # Check if retrieved data did not change after last loop
      if (hash == previous_hash) {
        message(paste0("Loop to get all newer data produced the same result ",
                       "twice. The previously newest post is therefore not ",
                       "reachable (likely because it was deleted). ",
                       "Emergency break!"))
        break
      }

      data <- dplyr::bind_rows(new_data, data)
      data <- dplyr::distinct(data, .data$id, .keep_all = TRUE)
      data <- dplyr::arrange(
        data,
        dplyr::desc(.data$created_time),
        dplyr::desc(.data$scrape_time),
        .data$id
      )
    }

    # Combine new data with previously retrieved data
    data <- dplyr::bind_rows(data, old_data)
    data <- dplyr::distinct(data, .data$id, .keep_all = TRUE)
    data <- dplyr::arrange(
      data,
      dplyr::desc(.data$created_time),
      dplyr::desc(.data$scrape_time),
      .data$id
    )

    # else: Run getPage for the first time
  } else {

    message(paste0(datafile, " not found. First run for ", page, " ..."))
    data <- tryCatch(
      {
        Rfacebook::getPage(page = page, token = token, n = n_posts,
                           reactions = reactions, feed = feed)
      # }, warning = function(w) {
      #   message(paste0("A warning occured while calling getPage: ", w))
      #   NULL # Return NULL
      }, error = function(e) {
        message(paste0("An error occurred while calling getPage: ", e))
        NULL # Return NULL
      })
    if (is.null(data)) { return(FALSE) } # Stop function and return FALSE
    if (nrow(data) < 1) {
      message("No posts downloadable.")
      return(FALSE)
    }
    # If reactions is TRUE, Rfacebook sorts the data in ascending order (oldest
    # post first), so the downloaded data is rearranged here:
    data <- dplyr::arrange(
      data,
      dplyr::desc(.data$created_time),
      dplyr::desc(.data$id)
    )
    data[, "scrape_time"] <- Sys.time()
    data[, "fan_count"] <-
      get_pageinfo(page = page, token = token)[["fan_count"]]
  }

  # Get older posts
  if (go_back) {
    message("Try to get older data.")
    repeat_counter <- 0
    oldest_id <- data[nrow(data), c("id")]
    hash <- ""
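    # oldest_id and hash act as loop guards: the repeat loop below stops when
    # a call returns the same oldest ID or byte-identical data as the previous
    # run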

    repeat {
      # counter is increased by 1
      repeat_counter <- repeat_counter + 1

      # Date of oldest post in existing data
      oldest <- substr(min(data$created_time), 1, 10)
      oldest <- lubridate::date(oldest) + 1
      oldest <- as.character(oldest)
      # Issue 158: https://github.com/pablobarbera/Rfacebook/issues/158
      # oldest <- as.numeric(lubridate::ymd_hms(min(page_data$created_time)))

      if (debug) {
        message(paste0("DEBUG: Oldest post for getting older posts: ", oldest))
      }

      # Get older posts
      older_data <- tryCatch(
        {
          Rfacebook::getPage(page, token = token, n = n_posts,
                             reactions = reactions, feed = feed,
                             until = oldest)
        # }, warning = function(w) {
        #   message(paste0("A warning occured while calling getPage: ", w))
        #   NULL # Return NULL
        }, error = function(e) {
          message(paste0("An error occurred while calling getPage: ", e))
          NULL # Return NULL
        })
      if (is.null(older_data)) {
        # break loop if an error occurred
        break
      }
      if (nrow(older_data) < 1) {
        message("No older posts downloadable.")
        break
      }
      # If reactions is TRUE, Rfacebook sorts the data in ascending order
      # (oldest post first), so the downloaded data is rearranged here:
      older_data <- dplyr::arrange(
        older_data,
        dplyr::desc(.data$created_time),
        dplyr::desc(.data$id)
      )
      older_data[, "scrape_time"] <- Sys.time()
      older_data[, "fan_count"] <-
        get_pageinfo(page = page, token = token)[["fan_count"]]

      if (debug) {
        message(paste0("DEBUG: Number of retrieved older posts: ",
                       nrow(older_data)))
      }

      # oldest ID from existing data or previous run is set as previous
      previous_oldest_id <- oldest_id
      # oldest ID from this run
      oldest_id <- older_data[nrow(older_data), c("id")]
      # hash from previous run is set as previous
      previous_hash <- hash
      # hash from this run is computed
      hash <- digest::digest(older_data, algo = "md5")

      # Debug messages
      if (debug) {
        message(paste0("DEBUG: Loop for getting older data ran for ",
                       repeat_counter, " time(s)."))
        message(paste0("DEBUG: Previously oldest ID is ", previous_oldest_id,
                       "."))
        message(paste0("DEBUG: Previously oldest post is from ",
                       min(data$created_time), "."))
        message(paste0("DEBUG: Now downloaded oldest ID is ", oldest_id, "."))
        message(paste0("DEBUG: Now downloaded oldest post is from ",
                       min(older_data$created_time), "."))
      }

      # No more older posts returned: break loop
      if (oldest_id == previous_oldest_id) {
        message(paste0("Oldest post downloaded for ", page, " is from ",
                       min(older_data$created_time), "."))
        break
      }

      # Check if retrieved data did not change after last loop
      if (hash == previous_hash) {
        message("Loop produced the same result twice. Emergency break!")
        break
      }

      message(paste0("There is still older data. Adding up to ", n_posts,
                     " old posts."))

      # Combine older data with existing data
      data <- dplyr::bind_rows(data, older_data)
      data <- dplyr::distinct(data, .data$id, .keep_all = TRUE)
      data <- dplyr::arrange(
        data,
        dplyr::desc(.data$created_time),
        dplyr::desc(.data$scrape_time),
        .data$id
      )

      # Break if max_repeats are reached
      if (repeat_counter >= max_repeats) {
        message("Loop ran for ", repeat_counter,
                " time(s). Max. number of repeats has been reached!")
        break
      }
    }
  }
  # Save updated data
  added_posts <- nrow(data) - existing_posts
  message(paste0(added_posts, " posts downloaded for ", page, "."))
  message(paste0("### Saving retrieved data for ", page, "."))
  saveRDS(data, file = datafile)
  return(TRUE)
}


#' Update multiple Facebook pages
#'
#' This function updates multiple Facebook pages with \link{Rfacebook}'s
#' getPage. It makes use of the \link{update_page} function in this package.
#' For each page in the provided character vector pages, it looks for a
#' previously stored data.frame (stored in datadir under the name provided in
#' the named vector pages, with the file extension .rds). If one exists, it
#' tries to scrape posts that are newer (and older) than the ones already
#' stored in the data.frame. Otherwise it creates a new data.frame.
#'
#' @param pages A named character vector. Names are the data file names,
#'   characters are the page names that will be updated (see example).
#' @param datadir A directory containing the stored data sets (as "name.rds").
#' @param token Either a temporary access token created at
#'   https://developers.facebook.com/tools/explorer or the OAuth token created
#'   with fbOAuth.
#' @param go_back Go back in time, not only update newer posts, but also get
#'   older ones. Default is TRUE.
#' @param n_posts The number of posts to scrape from each page, defaults to 100.
#' @param feed If TRUE, the function will also return posts from each page that
#'   were made by others (not only the admin of the page).
#' @param reactions If TRUE, will add variables to the data frame with the total
#'   count of reactions: love, haha, wow, sad, angry.
#' @param max_repeats The number of repeated calls to scrape data backwards. If
#'   go_back is TRUE, there will be repeated calls to go back in time (each call
#'   gets the number of posts in n_posts). The backward scraping will stop after
#'   max_repeats is reached. Defaults to 100.
#' @param debug If TRUE, more information will be printed.
#' @return A list with one element per page: TRUE if the update for that page
#'   ran through, FALSE otherwise.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # You have to authenticate with Facebook's API first. For more on this, see
#' # Rfacebook's documentation.
#' fb_oauth <- Rfacebook::fbOAuth(app_id="123456789",
#'                                app_secret="1A2B3C4D",
#'                                extended_permissions = TRUE)
#'
#' # A character vector is created
#' my_pages <- c(cnn = "cnn",
#'               bbc = "bbcnews")
#' # The two pages CNN and BBC news are updated and the results are stored in
#' # the user's home directory in the folder "temp". They are named cnn.rds and
#' # bbc.rds. If they do not already exist, both data sets are created.
#' update_pages(pages = my_pages, token = fb_oauth, datadir = "~/temp")
#' }
update_pages <- function(pages = NULL, token = NULL, datadir = "./data",
                         go_back = TRUE, n_posts = 100, feed = FALSE,
                         reactions = FALSE, max_repeats = 100, debug = FALSE) {
  finished <- FALSE
  # Check parameters
  if (is.null(pages)) {
    stop("No (named) character vector containing Facebook pages provided.")
  }
  # Creating data directory if necessary
  if (!dir.exists(datadir)) {
    message(paste0(datadir, " does not exist. Creating a new directory ..."))
    dir.create(datadir)
  }
  # Reading data set names and creating full paths to files
  datafiles <- names(pages)
  if (is.null(datafiles)) {
    stop("Provided character vector is not named. See documentation for an ",
         "example.")
  }
  datafiles <- file.path(datadir, paste0(datafiles, ".rds"))
  # Run update for every page
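  # (purrr::map2() pairs each page with its data file and calls update_page()
  # once per pair, returning a list of logicals in the same order as pages)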
  finished <- purrr::map2(pages, datafiles, update_page,
                          token = token, n_posts = n_posts, go_back = go_back,
                          feed = feed, reactions = reactions,
                          max_repeats = max_repeats, debug = debug)
  return(finished)
}