# whistleblower: Twitter Whistleblower Data Analysis

# Load libraries.

library(dplyr)
library(lubridate)
library(stringr)

# Define helper functions.

# Define get_hashtags function.

# Extract the hashtags found in each element of a character vector.
#
# Arguments:
#    string   Character vector of text to scan; Vectorize() hands the inner
#             function one element at a time.
#
# Returns:
#    One value per element of `string`: the matches of "#\\w+" joined by
#    ", ", "" when there are no matches, and NA when the element is NA or
#    the extraction signals a warning or an error. Because `string` is a
#    character vector, Vectorize() names the result after the inputs.

get_hashtags <- Vectorize(function(string) {

   # A missing text yields a missing result instead of a string pasted
   # together from NA.

   if (is.na(string)) {

      return(NA_character_)

   }

   # The value of tryCatch() is the value of the function. A handler's
   # return value replaces the result, so a failure is reported as NA
   # rather than falling through and returning the unmodified input text.

   tryCatch({

      hashtags <- str_extract_all(string, "#\\w+")

      paste(unlist(hashtags), collapse = ", ")

   }, warning = function(w) {

      message("get_hashtags: ", conditionMessage(w))

      NA_character_

   }, error = function(e) {

      message("get_hashtags: ", conditionMessage(e))

      NA_character_

   })

}, vectorize.args = "string")

# Define get_state_codes function.

# Join the state codes into a single "|"-separated string, suitable for use
# as a regular-expression alternation.
#
# Arguments:
#    states   Data frame (or list) with a state_code element.
#
# Returns:
#    A length-one character vector with the codes joined by "|", or NULL
#    when building it signals a warning or an error.

get_state_codes <- function(states){

   # Return the value of tryCatch() itself. Assigning inside the tryCatch()
   # expression and returning the variable afterwards would, on failure,
   # look the variable up outside the function instead of returning NULL.

   tryCatch({

      paste(unlist(states$state_code), collapse = "|")

   }, warning = function(w) {

      message("get_state_codes: ", conditionMessage(w))

      NULL

   }, error = function(e) {

      message("get_state_codes: ", conditionMessage(e))

      NULL

   })

}

# Define get_state_names function.

# Join the state names into a single "|"-separated string, suitable for use
# as a regular-expression alternation.
#
# Arguments:
#    states   Data frame (or list) with a state_name element.
#
# Returns:
#    A length-one character vector with the names joined by "|", or NULL
#    when building it signals a warning or an error.

get_state_names <- function(states){

   # Return the value of tryCatch() itself. Assigning to a local called
   # `names` inside the tryCatch() expression and returning it afterwards
   # would, on failure, return the base function names() instead of NULL.

   tryCatch({

      paste(unlist(states$state_name), collapse = "|")

   }, warning = function(w) {

      message("get_state_names: ", conditionMessage(w))

      NULL

   }, error = function(e) {

      message("get_state_names: ", conditionMessage(e))

      NULL

   })

}

# Create states data frame.

# Create character vector containing US state codes. Each code must be unique
# and sit at the same position as its name in the names vector below, because
# the two vectors are combined column-wise and later looked up with match().

codes <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI",
           "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
           "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
           "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
           "VT", "VA", "WA", "WV", "WI", "WY", "DC", "AS", "GU", "MP", "PR",
           "VI")

# Create character vector containing US state names; order of state codes and
# state names should be consistent, e.g. AL = Alabama, AK = Alaska, and so on.

names <- c("Alabama",
           "Alaska",
           "Arizona",
           "Arkansas",
           "California",
           "Colorado",
           "Connecticut",
           "Delaware",
           "Florida",
           "Georgia",
           "Hawaii",
           "Idaho",
           "Illinois",
           "Indiana",
           "Iowa",
           "Kansas",
           "Kentucky",
           "Louisiana",
           "Maine",
           "Maryland",
           "Massachusetts",
           "Michigan",
           "Minnesota",
           "Mississippi",
           "Missouri",
           "Montana",
           "Nebraska",
           "Nevada",
           "New Hampshire",
           "New Jersey",
           "New Mexico",
           "New York",
           "North Carolina",
           "North Dakota",
           "Ohio",
           "Oklahoma",
           "Oregon",
           "Pennsylvania",
           "Rhode Island",
           "South Carolina",
           "South Dakota",
           "Tennessee",
           "Texas",
           "Utah",
           "Vermont",
           "Virginia",
           "Washington",
           "West Virginia",
           "Wisconsin",
           "Wyoming",
           "District of Columbia",
           "American Samoa",
           "Guam",
           "Northern Mariana Islands",
           "Puerto Rico",
           "US Virgin Islands")

# Fail early if the vectors fall out of step or a code is repeated; a repeated
# code makes match() resolve it to the wrong name.

stopifnot(length(codes) == length(names),
          anyDuplicated(codes) == 0L)

# Combine vectors in data frame, naming the variables as it is built.

states <- data.frame(state_code = codes,
                     state_name = names,
                     country_name = "United States",
                     stringsAsFactors = FALSE)

# Get list of source files in the extdata directory.

files <- list.files("./inst/extdata", full.names = TRUE)

# With no files the steps below run zero times and source stays an empty
# data frame; say so rather than failing inside readRDS().

if (length(files) == 0) {

      message("No source files found in ./inst/extdata")

}

# Load source files from extdata directory one at a time, reporting progress
# via console. seq_along() yields no iterations for an empty file list,
# whereas 1:length(files) would count 1, 0.

loaded <- lapply(seq_along(files), function(f) {

      d <- readRDS(files[f])

      print(paste(f, files[f], "done", sep = " "))

      d

})

# Bind every file to the source data frame in a single call instead of
# re-copying the accumulated rows once per file. The leading empty data frame
# keeps the result a data frame when nothing was loaded.

source <- do.call(rbind, c(list(data.frame()), loaded))

rm(loaded)

# Create tweets data frame from the rows of source where is_retweet is FALSE,
# lang is "en" and verified is FALSE, keeping one row per status_id.
#
# Duplicate statuses are dropped before the derived columns are added. Every
# derived column is computed row by row, so the result is the same as
# deduplicating afterwards, but the date parsing and the element-by-element
# hashtag extraction no longer run on rows that are then thrown away.

tweets <- source %>%
      filter(is_retweet == FALSE & lang == "en" & verified == FALSE) %>%
      select(status_id,
             user_id,
             created_at,
             text) %>%
      distinct(status_id, .keep_all = TRUE) %>%
      mutate(created_at_date = as.Date(ymd_hms(created_at)),
             created_at_weekday = wday(created_at),
             created_at_hour = hour(created_at),
             hashtags = get_hashtags(text))

# Save the tweets data frame to the data directory.

usethis::use_data(tweets, overwrite = TRUE)

# Count the rows of the tweets data frame for each user_id.

tweet_count <- summarise(group_by(tweets, user_id), tweet_count = n())

# Create users data frame from the rows of source where is_retweet is FALSE,
# lang is "en" and verified is FALSE, keeping one row per user_id.
#
# Duplicate users are dropped before the derived columns are added. Every
# derived column is computed row by row, so the result is the same as
# deduplicating afterwards, but the location clean-up and the state pattern
# matching no longer run on rows that are then thrown away.

users <- source %>%
      filter(is_retweet == FALSE & lang == "en" & verified == FALSE) %>%
      select(user_id,
             screen_name,
             followers_count,
             friends_count,
             favourites_count,
             account_created_at,
             name,
             location,
             description) %>%
      distinct(user_id, .keep_all = TRUE) %>%
      mutate(
             # Blank out locations that contain no letters at all, then turn
             # empty locations into NA and collapse repeated whitespace.
             location = gsub("^[^A-Za-z]+$", replacement = "", location),
             location = if_else(location == "", NA_character_, location),
             location = str_squish(location),
             # Share of followers in the combined follower and friend count.
             ff_percentage = round((followers_count /
                                       (followers_count + friends_count)), 2),
             # Difference between calendar years, not elapsed time.
             account_age_in_years = round(year(Sys.Date()) - year(account_created_at), 2),
             account_created_at_year = year(account_created_at),
             # First state code and first state name found in the location.
             state_code = str_extract(location, get_state_codes(states)),
             state_name = str_extract(location, get_state_names(states)),
             # Fill whichever of the two is missing from the states lookup.
             state_code = if_else(is.na(state_code),
                                  states$state_code[match(state_name, states$state_name)],
                                  state_code),
             state_name = if_else(is.na(state_name),
                                  states$state_name[match(state_code, states$state_code)],
                                  state_name))

# Attach the tweet count to each user; the right join keeps every row of the
# tweet count data frame.

users <- users %>%
      right_join(tweet_count, by = "user_id")

# Write the users data frame to the data directory.

usethis::use_data(users, overwrite = TRUE)

# Source: data-raw/datasets.R in dtminnick/whistleblower (Twitter Whistleblower
# Data Analysis); documentation built on Nov. 14, 2019, 2:45 p.m. (via rdrr.io).