# {tweetbotornot2}

{tweetbotornot2} provides an out-of-the-box classifier for detecting Twitter bots that is easy to use, interpretable, scalable, and performant. It also provides a convenient interface for accessing the botometer API.
Install the development version of {tweetbotornot2} from GitHub with:
```r
## install {remotes} if not already installed
if (!"remotes" %in% installed.packages()) {
  install.packages("remotes")
}

## install {tweetbotornot2} from GitHub
remotes::install_github("mkearney/tweetbotornot2")
```
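Once installed, load the package before running the examples below:

```r
## load {tweetbotornot2}
library(tweetbotornot2)
```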
## `predict_bot()` to run the built-in bot classifier

Provide a vector or data frame of Twitter handles and `predict_bot()` will return the estimated probability of each account being a bot.
```r
## vector of screen names
screen_names <- c(
  "American__Voter",   ## (these ones should be bots)
  "MagicRealismBot",
  "netflix_bot",
  "mitchhedbot",
  "rstats4ds",
  "thinkpiecebot",
  "tidyversetweets",
  "newstarsbot",
  "CRANberriesFeed",
  "AOC",               ## (these ones should NOT be bots)
  "realDonaldTrump",
  "NateSilver538",
  "ChadPergram",
  "kumailn",
  "mindykaling",
  "hspter",
  "rdpeng",
  "kearneymw",
  "dfreelon",
  "AmeliaMN",
  "winston_chang"
)

## data frame with screen names **must be named 'screen_name'**
screen_names_df <- data.frame(screen_name = screen_names)

## vector -> bot estimates
predict_bot(screen_names)

## data.frame -> bot estimates
predict_bot(screen_names_df)
```
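The estimates come back as one row per account. As a quick way to scan the results, the sketch below ranks accounts from most to least bot-like; it assumes the output column holding the estimated probability is named `prob_bot`:

```r
## minimal sketch: rank accounts by estimated bot probability
## (assumes predict_bot() returns a data frame with a `prob_bot` column)
preds <- predict_bot(screen_names)
preds[order(preds$prob_bot, decreasing = TRUE), ]
```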
This also works on Twitter user IDs.
```r
## vector of user IDs (strings of numbers, ranging from 2-19 digits)
user_ids <- rtweet::lookup_users(screen_names)[["user_id"]]

## data frame with user IDs **must be named 'user_id'**
user_ids_df <- data.frame(user_id = user_ids)

## vector -> bot estimates
predict_bot(user_ids)

## data.frame -> bot estimates
predict_bot(user_ids_df)
```
The input given to `predict_bot()` can also be Twitter data returned by {rtweet}, i.e., `rtweet::get_timelines()`.<sup>1</sup>
```r
## timeline data returned by {rtweet}
twtdat <- rtweet::get_timelines(screen_names, n = 200, check = FALSE)

## generate predictions from twitter data frame
predict_bot(twtdat)
```
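Because the classifier was trained on [up to] each user's 200 most recent tweets (see the first footnote below), it can be worth checking how many tweets were actually collected per account before interpreting the estimates. A quick check on the `twtdat` data gathered above:

```r
## number of tweets collected per user; accounts well under 200 either tweet
## rarely or were undersampled
table(twtdat$screen_name)
```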
## `explain_bot()` to see the contributions made by each feature

View prediction contributions from the top five features (for each user) in the model.
```r
## view top feature contributions in prediction for each user
explain_bot(twtdat)[
  order(screen_name, -abs(value)),
][
  feature %in% feature[1:5], .SD, on = "feature"
][1:50, -1]
```
If you have already collected user timeline data, `predict_bot()` has no rate limit. If you don't already have timeline data, then `predict_bot()` relies on calls to Twitter's `users/timeline` API, which is rate limited to 1,500 calls per 15 minutes (for bearer tokens) or 900 calls per 15 minutes (for user tokens). Fortunately, each prediction requires only one call to Twitter's API, so it's possible to get up to 6,000 predictions per hour or 144,000 predictions per day.<sup>2</sup>
```r
## view bearer token rate limit for users/timeline endpoint
rtweet::rate_limit(rtweet::bearer_token(), "get_timeline")

## view user token rate limit for users/timeline endpoint
rtweet::rate_limit(rtweet::get_token(), "get_timeline")
```
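Because each prediction consumes one call to the timeline endpoint, asking for estimates on more than 1,500 accounts at once will run into the rate limit. One way around this, shown below purely as an illustrative sketch (the helper `predict_in_batches()` and the `all_users` vector are not part of the package), is to split the handles into chunks and pause for the 15-minute window between chunks, assuming a bearer token with the 1,500-call limit described above:

```r
## a minimal sketch (not part of {tweetbotornot2}): chunk a long vector of
## handles so timeline collection stays within ~1,500 calls per 15 minutes
predict_in_batches <- function(all_users, batch_size = 1500) {
  batches <- split(all_users, ceiling(seq_along(all_users) / batch_size))
  out <- vector("list", length(batches))
  for (i in seq_along(batches)) {
    out[[i]] <- tweetbotornot2::predict_bot(batches[[i]])
    ## wait out the rate-limit window between batches (skip after the last one)
    if (i < length(batches)) Sys.sleep(15 * 60)
  }
  ## combine per-batch estimates into a single data frame
  do.call(rbind, out)
}
```

With a user token, the same idea applies with `batch_size = 900`.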
The most influential features in the classifier (the commented-out code below was used to generate the feature-importance figure saved to `man/figures/README-import.png`):

```r
# idsbot <- c("user_id", "screen_name", "bot")
# mod <- tweetbotornot2:::prep_xgb_model()
# mod$feature_names <- tweetbotornot2:::tweetbotornot_xgb_model_feature_names
# xgboost::xgb.ggplot.importance(
#   xgboost::xgb.importance(
#     model = mod,
#     trees = seq_len(tweetbotornot2:::tweetbotornot_xgb_model_best_ntreelimit)
#   ),
#   measure = "Gain", top_n = 20
# ) +
#   ggplot2::scale_fill_viridis_d(begin = 0.05, end = 0.9) +
#   dataviz::theme_mwk(14) +
#   #ggplot2::coord_cartesian(ylim = c(0, 0.3)) +
#   #ggplot2::coord_flip(ylim = c(0, .12)) +
#   ggplot2::ggsave("man/figures/README-import.png", width = 9, height = 8,
#     units = "in", dpi = 312)
```
How features contributed to predictions in the original training data:
```r
# .d <- tfse::read_RDS("../twbt/data-final-munged.rds")
# mod <- tweetbotornot2:::prep_xgb_model()
# mod$feature_names <- tweetbotornot2:::tweetbotornot_xgb_model_feature_names
#
# png("man/figures/README-shap.png", width = 9, height = 8, units = "in", res = 312)
# par(tcl = -0.175, bty = "n", xaxt = "s", yaxt = "s", col = "#aaaaaa")
# cols <- viridis::viridis_pal(begin = 0.1, end = 0.9)(2)
# suppressWarnings(
#   xgboost::xgb.plot.shap(
#     data = as.matrix(.d[, -(1:3)]),
#     trees = seq_len(mod$best_ntreelimit),
#     model = mod,
#     top_n = 36, n_col = 6,
#     discrete_jitter = 0.15, span_loess = 0.35, col_loess = cols[1],
#     col = cols[2],
#     family = "Avenir Next LT Pro",
#     cex.lab = 1.5,
#     ylab = NA
#   )
# )
# dev.off()
```
## `predict_botometer()` to access Botometer's API

```r
## get botometer scores
predict_botometer(c('kearneymw', 'netflix_bot'))
```
Accuracy of tweetbotornot versus botometer across multiple datasets:
<sup>1</sup> The built-in classifier was trained using [up to] the most recent 200 tweets from each user. That means all tweets older than the 200th tweet will be filtered out (ignored). It also means that estimates made on fewer than the most recent 200 tweets are unreliable, except in cases where a user doesn't *have* up to 200 eligible tweets. In other words, the classifier should work as expected if data are gathered via {rtweet}'s `get_timeline()` function with the `n` argument set equal to or greater than 200, i.e., `rtweet::get_timelines(users, n = 200)`.
<sup>2</sup> This is in contrast to botometer, which recently increased its rate limit to 2,000 calls per day (up from 1,000 calls per day).