# R/Collect.reddit.R


#' @title Collect comments data from reddit threads
#'
#' @description Collects comments made by users on one or more specified subreddit conversation threads and structures
#'   the data into a dataframe with the class names \code{"datasource"} and \code{"reddit"}.
#'
#' @note The reddit web endpoint used for collection has a maximum limit of 500 comments per thread url.
#'
#' @param credential A \code{credential} object generated from \code{Authenticate} with class name \code{"reddit"}.
#' @param threadUrls Character vector. Reddit thread urls to collect data from.
#' @param waitTime Numeric vector. Time range in seconds from which a random wait time is chosen between url collection
#'   requests. Minimum is 3 seconds. Default is \code{c(3, 5)} for a wait time chosen from between 3 and 5 seconds.
#' @param ua Character string. Override User-Agent string to use in Reddit thread requests. Default is
#'   \code{getOption("HTTPUserAgent")} value as set by vosonSML.
#' @param writeToFile Logical. Write collected data to file. Default is \code{FALSE}.
#' @param verbose Logical. Output additional information about the data collection. Default is \code{FALSE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return A \code{tibble} object with class names \code{"datasource"} and \code{"reddit"}.
#'
#' @examples
#' \dontrun{
#' # reddit thread urls to collect comment data from
#' threadUrls <- c("https://www.reddit.com/r/xxxxxx/comments/xxxxxx/x_xxxx_xxxxxxxxx/")
#'
#' redditData <- redditAuth |>
#'   Collect(threadUrls = threadUrls, writeToFile = TRUE)
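#'
#' # optionally increase the wait time between requests and output progress information
#' redditData <- redditAuth |>
#'   Collect(threadUrls = threadUrls, waitTime = c(6, 10), verbose = TRUE)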
#' }
#'
#' @export
Collect.reddit <-
  function(credential,
           threadUrls,
           waitTime = c(3, 5),
           ua = getOption("HTTPUserAgent"),
           writeToFile = FALSE,
           verbose = FALSE,
           ...) {

    msg("Collecting comment threads for reddit urls...\n")

    if (missing(threadUrls)) {
      stop("Please provide a vector of one or more reddit thread urls.", call. = FALSE)
    }

    invisible(check_chr(threadUrls, param = "threadUrls"))

    # some protection against spamming requests
    def_wait <- list(min = 3, max = 10)
    if (!is.numeric(waitTime)) {
      stop(
        "Please provide a numeric range as vector c(min, max) for reddit thread request waitTime parameter.",
        call. = FALSE
      )
    } else {
      if (length(waitTime) == 1) waitTime <- c(def_wait$min, waitTime[1])
      if (length(waitTime) != 2) waitTime <- c(waitTime[1], waitTime[2])

      if (waitTime[1] < def_wait$min) waitTime[1] <- def_wait$min
      if (waitTime[1] >= waitTime[2]) waitTime[2] <- waitTime[1] + 1

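      # expand the range into an integer sequence that a wait time is sampled from per request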
      waitTime <- waitTime[1]:waitTime[2]
    }

    if (verbose) {
      msg(
        paste0(
          "Waiting between ",
          waitTime[1],
          " and ",
          waitTime[length(waitTime)],
          " seconds per thread request.\n"
        )
      )
    }

    threads_df <- NULL

    tryCatch({
      threads_df <- reddit_build_df(threadUrls, waitTime, ua, verbose, msg = msg)
    }, error = function(e) {
      stop(gsub("^Error:\\s", "", paste0(e)), call. = FALSE)
    }, finally = {
      threads_df <- tibble::as_tibble(threads_df)
    })

    if (!is.null(threads_df)) {
      if (nrow(threads_df) > 0) {
        msg("HTML decoding comments.\n")
        threads_df$comment <-
          textutils::HTMLdecode(threads_df$comment)

        # summary
        results_df <- threads_df |>
          dplyr::group_by(.data$thread_id) |>
          dplyr::summarise(
            title = paste0(unique(.data$title), collapse = ","),
            subreddit = paste0(unique(.data$subreddit), collapse = ","),
            count = dplyr::n()
          ) |>
          dplyr::ungroup()

        results_df$title <-
          ifelse(nchar(results_df$title) > 42,
                 paste0(strtrim(results_df$title, 42), "..."),
                 results_df$title)
        msg(print_summary(results_df))
        msg(paste0("Collected ", nrow(threads_df), " total comments.\n"))
      } else {
        msg(paste0("No comments were collected.\n"))
      }
    } else {
      msg(paste0("Collection dataframe is null.\n"))
    }

    class(threads_df) <- append(c("datasource", "reddit"), class(threads_df))
    if (writeToFile) {
      write_output_file(threads_df, "rds", "RedditData", verbose = verbose)
    }

    msg("Done.\n")

    threads_df
  }

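# internal helper: collects and combines the comments dataframe for each thread url, following
# reddit "continue thread" listings to retrieve nested comments beyond the initial request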
reddit_build_df <- function(threadUrls, waitTime, ua, verbose, msg = msg) {
  threads <- lapply(threadUrls, function(x) {
    if (length(threadUrls) > 1 && (x != threadUrls[1])) {
      Sys.sleep(sample(waitTime, 1))
    }

    thread_json <-
      reddit_data(x,
                  wait_time = waitTime,
                  ua = ua,
                  verbose = verbose,
                  msg = msg)
    branch_df <- reddit_content_plus(thread_json, x, msg = msg)

    # loop protection
    prev_value <- NULL

    extra_threads <-
      dplyr::filter(branch_df, grepl("Listing:", .data$comm_id))
    while (nrow(extra_threads) > 0) {
      row_i <- 1 # top row

      # loop protection
      if (!is.null(prev_value) &&
          extra_threads[row_i, "comm_id"] == prev_value) {
        msg("Loop protection following continue threads. Exiting.\n")
        break
      }
      prev_value <- extra_threads[row_i, "comm_id"]

      Sys.sleep(sample(waitTime, 1))

      # get continue thread comment position info
      cont_index <- as.numeric(extra_threads[row_i, "id"])
      depth <-
        as.numeric(gsub(".*_(\\d)_\\d$", "\\1", extra_threads[row_i, "structure"]))
      struct <-
        gsub("_\\d_\\d$", "", extra_threads[row_i, "structure"])
      cont_thread_id <-
        gsub("Listing:t1_", "", extra_threads[row_i, "comm_id"])

      # set continue thread comment rm flag to true
      branch_df <- dplyr::mutate(branch_df,
                                 rm = ifelse((
                                   .data$comm_id == cont_thread_id |
                                     .data$comm_id == extra_threads[row_i, "comm_id"]
                                 ),
                                 TRUE,
                                 .data$rm
                                 ))

      # get continue thread
      cont_json <-
        reddit_data(paste0(x, cont_thread_id),
                    waitTime,
                    ua,
                    cont = cont_thread_id,
                    verbose = verbose,
                    msg = msg)
      cont_df <- reddit_content_plus(cont_json, x, depth = depth, msg = msg)

      # if comments returned
      if (nrow(cont_df)) {
        cont_df <-
          cont_df |> dplyr::mutate(structure = paste0(struct, "_", .data$structure)) # append structure

        # insert new comments into thread dataframe using position
        if (cont_index == 1) {
          branch_df <- dplyr::bind_rows(cont_df, branch_df)
        } else {
          pre_df <- dplyr::bind_rows(branch_df[1:cont_index - 1, ], cont_df)
          branch_df <-
            dplyr::bind_rows(pre_df, branch_df[cont_index:nrow(branch_df), ])
        }
      }

      extra_threads <- extra_threads[-row_i, ] # redundant, as extra_threads is re-derived from branch_df below
      extra_threads <-
        dplyr::filter(branch_df,
                      grepl("Listing:", .data$comm_id),
                      .data$rm == FALSE)
    } # end while

    if (!is.null(branch_df) && nrow(branch_df) > 0) {
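      # thread id is the 6 character segment following "/comments/" in the url,
      # e.g. ".../comments/abc123/a_post_title/" gives "abc123" (illustrative url)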
      branch_df$thread_id <-
        gsub(
          "^(.*)?/comments/([0-9A-Za-z]{6})?/{0,1}(.*)?/{0,1}$",
          "\\2",
          branch_df$url,
          ignore.case = TRUE,
          perl = TRUE,
          useBytes = TRUE
        ) # extract thread id
      branch_df <- branch_df |>
        dplyr::filter(.data$rm == FALSE) |> # remove continue thread entries
        dplyr::arrange(.data$thread_id, .data$id)

      branch_df$id <- seq_along(branch_df$id) # re-index
    }

    branch_df
  })

  threads_df <- dplyr::bind_rows(threads)

  threads_df
}

# based on method by @ivan-rivera RedditExtractoR
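# requests thread comment data as json from the reddit endpoint (limit=500, raw_json=1),
# retrying once if no data is returned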
reddit_data <-
  function(url,
           wait_time,
           ua,
           cont = NULL,
           verbose = TRUE,
           msg = msg) {

    if (is.null(url) || length(url) == 0 || !is.character(url)) {
      stop("invalid URL parameter")
    }

    if (!grepl("^https?://(.*)", url)) {
      url <-
        paste0("https://www.", gsub("^.*(reddit\\..*$)", "\\1", url))
    }
    if (!grepl("\\?ref=search_posts$", url)) {
      url <- paste0(gsub("/$", "", url), "/?ref=search_posts")
    }

    req_url <-
      paste0(gsub("\\?ref=search_posts$", "", url),
             ".json?limit=500&raw_json=1")
    req_tid <- get_thread_id(req_url, TRUE)

    if (is.null(cont)) {
      msg(paste0("Request thread: ", req_tid, "\n"))
    } else {
      if (verbose) {
        msg(paste0("Continue thread: ", req_tid, " - ", cont, "\n"))
      }
    }

    req_data <- get_json(req_url, ua) # ua

    if (is.null(req_data$data)) {
      Sys.sleep(sample(wait_time, 1))

      if (is.null(cont)) {
        msg(paste0("Retry thread: ", req_tid, "\n"))
      } else {
        if (verbose) {
          msg(paste0("Retry continue thread: ", req_tid, " - ", cont, "\n"))
        }
      }

      req_data <- get_json(req_url, ua) # ua
    }

    if (is.null(req_data$status) ||
        as.numeric(req_data$status) != 200) {
      msg(paste0("Failed: ", url,
                 ifelse(
                   is.null(req_data$status),
                   "",
                   paste0(" (", req_data$status, ")")
                 ),
                 "\n"))
    }

    req_data$data
  }

# based on method by @ivan-rivera RedditExtractoR
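# recursively extracts a single attribute (feature) from a comment node and all of its reply nodes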
reddit_values_list <- function(node, feature) {
  attr <- node$data[[feature]]
  if (is.null(attr)) {
    attr <- NA
  }
  if (feature == "id") {
    if (attr == "_") {
      attr <- paste0("Listing:", node$data$parent_id)
    }
  }
  replies <- node$data$replies
  reply_nodes <- NULL
  if (is.list(replies)) {
    reply_nodes <- replies$data$children
  }

  attrs <- list(attr,
                lapply(reply_nodes, function(x) {
                  reddit_values_list(x, feature)
                }))

  attrs
}

# based on method by @ivan-rivera RedditExtractoR
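# recursively builds comment position strings of the form depth_index, e.g. "1_2_1" is the first
# reply to the second reply of the first top-level comment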
reddit_struct_list <- function(node, depth = 0) {
  if (is.null(node)) {
    return(list())
  }

  reply_nodes <- NULL
  replies <- node$data$replies
  if (is.list(replies)) {
    reply_nodes <- replies$data$children
  }

  # depth is stored as character so the structure column is not typed as integer for threads with no nested replies
  structures <- list(as.character(depth),
                     lapply(1:length(reply_nodes), function(x) {
                       reddit_struct_list(reply_nodes[[x]], paste0(depth, "_", x))
                     }))

  structures
}

# based on method by @ivan-rivera RedditExtractoR
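# flattens a thread json listing into a tibble with one row per comment, combining thread level
# metadata with per comment values and position structure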
reddit_content_plus <- function(raw_data, req_url, depth = 0, msg = msg) {
  data_extract <- tibble::tibble(
    id = numeric(),
    structure = character(),
    post_date = character(),
    post_date_unix = numeric(),
    comm_id = character(),
    comm_date = character(),
    comm_date_unix = numeric(),
    num_comments = numeric(),
    subreddit = character(),
    upvote_prop = numeric(),
    post_score = numeric(),
    author = character(),
    user = character(),
    comment_score = numeric(),
    controversiality = numeric(),
    comment = character(),
    title = character(),
    post_text = character(),
    link = character(),
    domain = character(),
    url = character(),
    rm = logical()
  )

  if (is.null(raw_data)) {
    return(data_extract)
  }

  meta_node <- raw_data[[1]]$data$children[[1]]$data
  main_node <- raw_data[[2]]$data$children

  if (min(length(meta_node), length(main_node)) > 0) {
    structures_list <- unlist(lapply(1:length(main_node), function(x) {
      reddit_struct_list(main_node[[x]], depth = ifelse(depth != 0, depth, x))
    }))

    data <- tibble::tibble(
      id               = NA,
      structure        = structures_list,
      post_date        = as.character(lubridate::as_datetime(
        as.numeric(meta_node$created_utc), tz = "UTC"
      )),
      post_date_unix   = as.numeric(meta_node$created_utc),
      comm_id          = unlist(lapply(main_node, function(x) {
        reddit_values_list(x, "id")
      })),
      comm_date        = as.character(lubridate::as_datetime(as.numeric(
        unlist(lapply(main_node, function(x) {
          reddit_values_list(x, "created_utc")
        }))
      ), tz = "UTC")),
      comm_date_unix   = as.numeric(unlist(lapply(main_node, function(x) {
        reddit_values_list(x, "created_utc")
      }))),
      num_comments     = meta_node$num_comments,
      subreddit        = ifelse(
        is.null(meta_node$subreddit),
        "UNKNOWN",
        meta_node$subreddit
      ),
      upvote_prop      = meta_node$upvote_ratio,
      post_score       = meta_node$score,
      author           = meta_node$author,
      user             = unlist(lapply(main_node, function(x) {
        reddit_values_list(x, "author")
      })),
      comment_score    = unlist(lapply(main_node, function(x) {
        reddit_values_list(x, "score")
      })),
      controversiality = unlist(lapply(main_node, function(x) {
        reddit_values_list(x, "controversiality")
      })),
      comment          = unlist(lapply(main_node, function(x) {
        reddit_values_list(x, "body")
      })),
      title            = meta_node$title,
      post_text        = meta_node$selftext,
      link             = meta_node$url,
      domain           = meta_node$domain,
      url              = req_url,
      rm               = FALSE
    )

    data$id <- 1:nrow(data)

    if (dim(data)[1] > 0 && dim(data)[2] > 0) {
      data_extract <- rbind(data, data_extract)
    } else {
      msg(paste0("No data: ", req_url, "\n"))
    }
  }

  data_extract
}
