# data-raw/datasets.R

# Load libraries.

library(dplyr)
library(lubridate)
library(stringr)

# Define helper functions.

# Define get_hashtags function.
#
# Collects the hashtags found in each element of a character vector.
#
# string: character vector of texts to scan; the function is vectorized over
#         this argument, so each element is processed on its own.
#
# Returns a character vector with one element per input element, holding the
# hashtags (a "#" followed by word characters) joined by ", ". An element
# without hashtags yields "". A missing element, or an element whose
# extraction raises a warning or error, yields NA.

get_hashtags <- Vectorize(function(string) {

   # A missing text has no hashtags; without this guard the NA would be
   # pasted into the literal string "NA".

   if (length(string) == 1L && is.na(string)) {

      return(NA_character_)

   }

   # The value of tryCatch is the value of the function, so the handlers'
   # NA actually reaches the caller instead of the unprocessed input text.

   tryCatch({

      tags <- stringr::str_extract_all(string, "#\\w+")

      paste(unlist(tags), collapse = ", ")

   }, warning = function(w) {

      message("get_hashtags: ", conditionMessage(w))

      NA_character_

   }, error = function(e) {

      message("get_hashtags: ", conditionMessage(e))

      NA_character_

   })

}, vectorize.args = "string")

# Define get_state_codes function.
#
# Joins the state codes into a single alternation string that can be used as
# a regular expression.
#
# states: data frame (or list) with a state_code element.
#
# Returns one string with the codes separated by "|", e.g. "AL|AK|AZ".
# Returns NULL, after reporting the condition, if building the string raises
# a warning or an error.

get_state_codes <- function(states){

   # The value of tryCatch is the value of the function, so a handler's NULL
   # reaches the caller; nothing falls through to a global object that
   # happens to be called codes.

   tryCatch({

      paste(unlist(states$state_code), collapse = "|")

   }, warning = function(w) {

      message("get_state_codes: ", conditionMessage(w))

      NULL

   }, error = function(e) {

      message("get_state_codes: ", conditionMessage(e))

      NULL

   })

}

# Define get_state_names function.
#
# Joins the state names into a single alternation string that can be used as
# a regular expression.
#
# states: data frame (or list) with a state_name element.
#
# Returns one string with the names separated by "|", e.g.
# "Alabama|Alaska|Arizona". Returns NULL, after reporting the condition, if
# building the string raises a warning or an error.

get_state_names <- function(states){

   # The value of tryCatch is the value of the function, so a handler's NULL
   # reaches the caller; nothing falls through to another object called
   # names (a global vector or the base function).

   tryCatch({

      paste(unlist(states$state_name), collapse = "|")

   }, warning = function(w) {

      message("get_state_names: ", conditionMessage(w))

      NULL

   }, error = function(e) {

      message("get_state_names: ", conditionMessage(e))

      NULL

   })

}

# Create states data frame.

# Create character vector containing the postal codes of the US states, the
# District of Columbia and the US territories.

codes <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI",
           "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
           "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
           "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
           "VT", "VA", "WA", "WV", "WI", "WY", "DC", "AS", "GU", "MP", "PR",
           "VI")

# Create character vector containing US state names; order of state codes and
# state names should be consistent, e.g. AL = Alabama, AK = Alaska, and so on.

names <- c("Alabama",
           "Alaska",
           "Arizona",
           "Arkansas",
           "California",
           "Colorado",
           "Connecticut",
           "Delaware",
           "Florida",
           "Georgia",
           "Hawaii",
           "Idaho",
           "Illinois",
           "Indiana",
           "Iowa",
           "Kansas",
           "Kentucky",
           "Louisiana",
           "Maine",
           "Maryland",
           "Massachusetts",
           "Michigan",
           "Minnesota",
           "Mississippi",
           "Missouri",
           "Montana",
           "Nebraska",
           "Nevada",
           "New Hampshire",
           "New Jersey",
           "New Mexico",
           "New York",
           "North Carolina",
           "North Dakota",
           "Ohio",
           "Oklahoma",
           "Oregon",
           "Pennsylvania",
           "Rhode Island",
           "South Carolina",
           "South Dakota",
           "Tennessee",
           "Texas",
           "Utah",
           "Vermont",
           "Virginia",
           "Washington",
           "West Virginia",
           "Wisconsin",
           "Wyoming",
           "District of Columbia",
           "American Samoa",
           "Guam",
           "Northern Mariana Islands",
           "Puerto Rico",
           "US Virgin Islands")

# The two vectors are paired by position and are used as lookup keys for each
# other, so they must have equal length and contain no repeated entries.

stopifnot(length(codes) == length(names),
          anyDuplicated(codes) == 0L,
          anyDuplicated(names) == 0L)

# Combine vectors in data frame, naming the variables as it is built.

states <- data.frame(state_code = codes,
                     state_name = names,
                     country_name = "United States",
                     stringsAsFactors = FALSE)

# Get list of source files in the extdata directory.

files <- list.files("./inst/extdata", full.names = TRUE)

# Load source files from extdata directory one at a time into a preallocated
# list. seq_along() keeps the loop from running when no files are found;
# 1:length(files) would iterate over c(1, 0) and fail on a missing path.

pieces <- vector("list", length(files))

for (f in seq_along(files)) {

      pieces[[f]] <- readRDS(files[f])

      # Track progress via console.

      print(paste(f, files[f], "done", sep = " "))

}

# Bind all files to the source data frame in one step instead of growing the
# data frame inside the loop; with no files the source data frame is empty.

source <- if (length(pieces) > 0) do.call(rbind, pieces) else data.frame()

rm(pieces)

# Create tweets data frame, one verb at a time.

# Keep English-language tweets that are not retweets and do not come from
# verified accounts.

tweets <- filter(source,
                 is_retweet == FALSE,
                 lang == "en",
                 verified == FALSE)

# Keep only the identifying variables, the timestamp and the tweet text.

tweets <- select(tweets, status_id, user_id, created_at, text)

# Derive the date, weekday and hour of creation, and the hashtags in the text.

tweets <- mutate(tweets,
                 created_at_date = as.Date(ymd_hms(created_at)),
                 created_at_weekday = wday(created_at),
                 created_at_hour = hour(created_at),
                 hashtags = get_hashtags(text))

# Keep the first row for each status_id.

tweets <- distinct(tweets, status_id, .keep_all = TRUE)

# Save the tweets data frame to the data directory.

usethis::use_data(tweets, overwrite = TRUE)

# Create tweet count data frame: number of rows in tweets for each user_id.

tweet_count <- summarise(group_by(tweets, user_id), tweet_count = n())

# Create users data frame: one row per user_id, taken from English-language
# tweets that are not retweets and do not come from verified accounts.
#
# location is cleaned first: values without any letter become NA, and
# whitespace is squished. state_code and state_name are then extracted from
# location, and each is used to fill in the other where only one was found.

users <- source %>%
      filter(is_retweet == FALSE & lang == "en" & verified == FALSE) %>%
      select(user_id,
             screen_name,
             followers_count,
             friends_count,
             favourites_count,
             account_created_at,
             name,
             location,
             description) %>%
      mutate(location = gsub("^[^A-Za-z]+$", replacement = "", location),
             location = if_else(location == "", NA_character_, location),
             location = str_squish(location),
             ff_percentage = round((followers_count / (followers_count + friends_count)), 2),
             account_age_in_years = round(year(Sys.Date()) - year(account_created_at), 2),
             account_created_at_year = year(account_created_at),
             # Match state codes only as whole words; without the word
             # boundaries a code is found inside any upper-case word, e.g.
             # "IN" in "AUSTIN" or "AL" in "DALLAS".
             state_code = str_extract(location,
                                      paste0("\\b(?:",
                                             get_state_codes(states),
                                             ")\\b")),
             state_name = str_extract(location, get_state_names(states)),
             # Fill in a missing code from the matched name, then a missing
             # name from the (possibly just filled-in) code.
             state_code = if_else(is.na(state_code),
                                  states$state_code[match(state_name, states$state_name)],
                                  state_code),
             state_name = if_else(is.na(state_name),
                                  states$state_name[match(state_code, states$state_code)],
                                  state_name)) %>%
      distinct(user_id, .keep_all = TRUE)

# Join the tweet count and users data frames; the right join keeps every
# user_id present in tweet_count.

users <- right_join(users, tweet_count, by = "user_id")

# Save the users data frame to the data directory.

usethis::use_data(users, overwrite = TRUE)
# dtminnick/whistleblower documentation built on Nov. 14, 2019, 2:45 p.m.