####################################
#### Testing mTurk Bot Function ####
####################################
# Comes from: https://github.com/SICLab/detecting-bots
# This is an example of how to use this function. This function is designed to identify low-quality mTurk responses.
# This function assigns a score to each response. The higher the score, the higher the probability that the respondant is a bot or survey-farmer.
# It is best to examine each response with a high score manually. This function cannot replace the human eye- it can only guide it.
# Function arguments:
# Data - your dataset
# Latitude - A column with latitude coordinates for your respondant.
# Longitude - A column with longitude coordinates for your respondant.
# Time - An optional column with Qualtrics-formatted date and time stamps.
# Comments - An optional free-response field.
# Comments2 - A second, optional free-response field.
# Comments3 - A third, optional free-response field.
# Scoring:
# Scores can go as high as 7 if you have three free-resposne fields.
# Having a latitude and longitude that appears in more than 1% of responses adds 1 point. (I recommend changing the percentage depending on the size of your dataset.)
# Having a duplicate latitude and longitude, AND responding within 10 minutes of the other responses from the same latitude and longitude adds 1 point. (I recommend StartedDate.)
# Comments consisting solely of phrases typically attributed to bots/duplicate responses/survey farmers adds 1 point. (Send new suggestions for phrases to jprims2@uic.edu.)
# Duplicate comments that other respondants have already made in response to the same question add 1 point.
# Max score for only latitude and longitude: 1
# Max score for latitude, longitude, and time: 2
# Max score for latitude, longitude, and one free-response: 3
# Max score for latitude, longitude, time, and one free-response: 4
# Max score for latitude, longitude, and two free-responses: 5
# Max score for latitude, longitude, time, and two free-responses: 6
# Max score for latitude, longitude, and three free-responses: 7
# Max score for latitude, longitude, time, and three free-responses: 8
# Loading in the function
bot.detector <- function(Latitude, Longitude, Time, Threshold = .01, Comments, Comments2, Comments3){
# This loads in required packages. (Mostly for the Time argument.)
require(tidyr)
require(dplyr)
require(zoo)
# This creates a new column to store our bot suspicion score.
bot.susp <- rep(NA, length(Latitude))
# First, let's work on detecting if there are some coordinates that appear in more than 1% of the a.
# With Qualtrics, the columns we want to look at are Latitude and Longitude.
# Creating an object combining those two into one column
latlong <- paste(Latitude,Longitude)
# This counts the number of times each coordinate appears in the aset.
llcount <- summary(as.factor(latlong))
# This determines if a certain latitude and longitude appears in more than 1% of responses.
lllots <- llcount > length(Latitude) * Threshold # You can change the .01 to change the % of the sample.
# Pulls out the coordinates that make up more than 1% of the sample.
llmany <- names(lllots[lllots == TRUE])
# Adds a 1 to the bot suspicion column if the coordinates appear in more than 1% of the sample
bot.susp <- ifelse(latlong %in% llmany, 1, 0)
# Now, let's check if their free response contains "good" or "NICE!"
suswords <- c("good","NICE!")
# Transform vector of phrases to lowercase
suswords <- tolower(suswords) # See https://www.maxhuibai.com/blog/evidence-that-responses-from-repeating-gps-are-random for illustration
# Check if person specified a column of times. If so, run.
if(missing(Time)) {
NULL
} else {
# First, converting time to a format R can use. Using the typical Qualtrics organization.
Time <- as.POSIXct(Time, tz = "", format = "%m/%d/%Y %H:%M", optional = FALSE)
Time <- as.numeric(Time)
# I'd like to make a dataframe so I can filter things.
tempdat <- data.frame(latlong, Time)
# Now, adding an ID
tempdat$id <- 1:(nrow(tempdat))
# This filters it so the dataframe only keeps rows with suspicious coordinates, and moves it to long format.
tempdatw <- spread(subset(tempdat, tempdat$latlong %in% llmany), latlong, Time)
# Fill in NAs with 0s
tempdatw[is.na(tempdatw)] <- 0
# Check if time difference between a duplicate and the previous duplicate response is between 1 and 600 seconds (10 minutes)
# Code for 1 duplicate and more duplicates
ifelse(ncol(tempdatw) == 2,
# If one repeating coordinate
ifelse(abs(tempdatw[,2] - lag(tempdatw[,2], n = 1L)) < 600 & abs(tempdatw[,2] - lag(tempdatw[,2], n = 1L)) > 1, TRUE, FALSE),
# If multiple coordinates
tempdatw[,-1] <- lapply(tempdatw[,-1], function(x) ifelse(abs(x - lag(x, n = 1L)) < 600 & abs(x - lag(x, n = 1L)) > 1, TRUE, FALSE))
)
# I think I need to sum the two columns into one.
tempdatw$sum <- rowSums(tempdatw[,-1])
# Putting it back in long format, so I can merge it back in with our temporary data frame
# TEMPDAT L IS LISTING SOME IDS TWICE
tempdatl <- tempdatw[,c("id","sum")]
# Merge back in to tempdat
findat <- merge(tempdat, tempdatl[,c("id","sum")], by = "id", all.x = TRUE)
findat$sum <- ifelse(is.na(findat$sum), 0, findat$sum)
# Now, let's add that suspicion!
bot.susp <- ifelse(findat$sum >= 1, bot.susp + 1, bot.susp) # For some reason, it's not adding.
}
# Check if person specified a free-response. If so, run.
if(missing(Comments)) {
NULL
} else {
# Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
# Transform comment vectors to lowercase
Comments <- tolower(Comments)
# Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
bot.susp <- ifelse(Comments %in% suswords, bot.susp + 1, bot.susp)
# Now, check if any free responses are 100% matches to other free responses.
bot.susp <- ifelse(duplicated(Comments), bot.susp + 1, bot.susp)
}
# Check if person specified second free-response. If so, run.
if(missing(Comments2)) {
NULL
} else {
# Transform comment vectors to lowercase
Comments2 <- tolower(Comments2)
# Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
# Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
bot.susp <- ifelse(Comments2 %in% suswords, bot.susp + 1, bot.susp)
# Now, check if any free responses are 100% matches to other free responses.
bot.susp <- ifelse(duplicated(Comments2), bot.susp + 1, bot.susp)
}
# Check if person specified third free-response. If so, run.
if(missing(Comments3)) {
NULL
} else {
# Transform comment vectors to lowercase
Comments3 <- tolower(Comments3)
# Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
# Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
bot.susp <- ifelse(Comments3 %in% suswords, bot.susp + 1, bot.susp)
# Now, check if any free responses are 100% matches to other free responses.
bot.susp <- ifelse(duplicated(Comments3), bot.susp + 1, bot.susp)
}
# Outputting results
return(bot.susp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.