# data-raw/subreddits.R

# get and pre-process the subreddit data

library(dplyr)
library(ndjson)

# Requires the zstd command-line tools (they provide `unzstd`, used below).
# On Debian/Ubuntu: sudo apt-get install zstd

# Location of the zstd-compressed NDJSON file of subreddit records.
url <- "https://files.pushshift.io/reddit/subreddits/reddit_subreddits.ndjson.zst"

# Download in binary mode: the archive is compressed data, and ".zst" is not
# one of the extensions for which download.file() switches to "wb" by itself,
# so the default text mode can corrupt the file on Windows.
download_status <- download.file(
  url,
  destfile = "data-raw/reddit_subreddits.ndjson.zst",
  mode = "wb"
)

# download.file() reports success as 0; some methods only warn on failure.
if (download_status != 0) {
  stop(
    "Download of ", url, " failed with status ", download_status,
    call. = FALSE
  )
}

# Decompress next to the archive. "-f" lets a re-run overwrite a previously
# extracted file instead of refusing to write the output.
unzstd_status <- system2(
  command = "unzstd",
  args = c("-f", "data-raw/reddit_subreddits.ndjson.zst")
)

# system2() signals failure only through its return value, so stop here
# rather than let the next step read a missing or stale file.
if (unzstd_status != 0) {
  stop("unzstd failed with exit status ", unzstd_status, call. = FALSE)
}

# Path of the decompressed NDJSON file.
ndjson_path <- "data-raw/reddit_subreddits.ndjson"

# Read every record, keep only the columns this script works with, and
# convert the result to a tibble.
subreddits_large <- ndjson_path %>%
  ndjson::stream_in() %>%
  dplyr::select(
    id,
    title,
    display_name,
    public_description,
    subscribers
  ) %>%
  dplyr::as_tibble()

# Keep subreddits with a known subscriber count above 1000, then drop the
# `id` column from the result.
subreddits <- subreddits_large %>%
  dplyr::filter(
    !is.na(subscribers),
    subscribers > 1000
  ) %>%
  dplyr::select(-id)

# Replace every "&amp;" HTML entity in a character vector with a plain "&".
unescape_ampersand <- function(text) {
  stringr::str_replace_all(text, "\\&amp;", "\\&")
}

# Apply the entity clean-up to both free-text columns.
subreddits <- subreddits %>%
  dplyr::mutate(
    public_description = unescape_ampersand(public_description),
    title = unescape_ampersand(title)
  )

# Store the cleaned table as the package dataset `subreddits`, replacing any
# previously saved version.
usethis::use_data(subreddits, overwrite = TRUE)
# BelangerAnalytics/socialastronomy.pbs4dash documentation built on Feb. 15, 2022, 8:06 a.m.