# R/cleanData.R

# Defines functions cleanData

# Documented in cleanData

#' @title Clean web click data
#' @name cleanData
#' @description Takes a link click dataset, removes extraneous clicks
#'   (ad networks, file hosts, raw IP addresses, etc.), labels each remaining
#'   click with a referrer category and a topic flag, and coerces several
#'   columns to analysis-friendly types.
#'
#' @details
#' Rows are dropped when `AUTHORITY_URI` is one of a fixed set of hosts,
#' contains one of a fixed set of literal fragments, or looks like a dotted
#' IPv4 address; or when `AUTHORITY_URI_REFERRER` looks like an IPv4 address
#' or is one of a fixed set of referrer hosts. Only rows that positively
#' match are removed, so rows with a missing (`NA`) host or referrer are kept.
#'
#' Two columns are added:
#' \itemize{
#'   \item `referrer`: `"Facebook"`, `"Twitter"`, `"direct"`, `"Unknown"`
#'     (missing referrer) or `"Website"` (anything else), as a factor.
#'   \item `topicrelated`: `"Yes"` when `URL_REQUEST` matches any element of
#'     `matchvector`, otherwise `"No"`.
#' }
#'
#' `VENDORNAME_OPERATINGSYSTEM` and `TYPE_HARDWAREPLATFORM` become factors,
#' `TIMESTAMP_INIT` is parsed as a date-time, and
#' `DURATION_FROMCLICKTOCREATION` is converted to numeric and divided by 3600
#' (presumably seconds to hours -- confirm against the data dictionary).
#'
#' @param data A dataframe pulled from the webclick data. Must contain the
#'   columns `AUTHORITY_URI`, `AUTHORITY_URI_REFERRER`, `URL_REQUEST`,
#'   `VENDORNAME_OPERATINGSYSTEM`, `TYPE_HARDWAREPLATFORM`, `TIMESTAMP_INIT`
#'   and `DURATION_FROMCLICKTOCREATION`.
#' @param matchvector A vector of strings used to identify topics of link
#'   clicks. The elements are combined with `|` and used as a regular
#'   expression against `URL_REQUEST`. An empty vector marks every row `"No"`.
#'
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom lubridate duration
#' @importFrom lubridate ymd_hms
#'
#'
#' @return datanoads a tibble with ads removed and prepared for analysis
#' @export
#'
#'


cleanData <- function(data, matchvector) {

  required_cols <- c(
    "AUTHORITY_URI", "AUTHORITY_URI_REFERRER", "URL_REQUEST",
    "VENDORNAME_OPERATINGSYSTEM", "TYPE_HARDWAREPLATFORM",
    "TIMESTAMP_INIT", "DURATION_FROMCLICKTOCREATION"
  )
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "`data` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Hosts removed on an exact match of AUTHORITY_URI.
  blocked_hosts <- c(
    "t.co",
    "jamdex.com",
    "awesome-cool-music.blogspot.com",
    "greatmusicstreaming.blogspot.com",
    "n2adshostnet.com",
    "prpops.com",
    "best-streaming-music.blogspot.com",
    "www.dropbox.com",
    "static-v2.astar.mobi",
    "dl.dropboxusercontent.com",
    "interactive.tegna-media.com"
  )

  # Fragments removed when they occur anywhere in AUTHORITY_URI. They are
  # matched literally (fixed = TRUE) so that "." is a dot rather than "any
  # character"; as a regex, "api." would also hit hosts such as "rapid.com".
  blocked_fragments <- c(
    "adf.ly",
    "getgiftcards.org",
    "weebly.com",
    # NOTE(review): the previous pattern here was "$<U.*", which can never
    # match because "$" anchors the end of the string. It presumably targeted
    # "<U+XXXX>" escape residue in host names -- confirm.
    "<U+",
    "myautodj.com",
    "rackcdn.com",
    "voluumtrk.com",
    "blogspot.com",
    "api.",
    "click.",
    "ad.doubleclick",
    "yakidee.org",
    "app."
  )

  # Referrers removed on an exact match of AUTHORITY_URI_REFERRER.
  blocked_referrers <- c(
    "video-promo.net", "aptrk.com", "www.IsraeLIVE.org",
    "watchonlinevideos.org"
  )

  # Dotted IPv4 address with an optional port. Every octet needs at least one
  # digit: "{,3}" quantifiers would let the pattern match the empty string
  # and therefore every non-missing value.
  ip_pattern <- "^[[:digit:]]{1,3}(\\.[[:digit:]]{1,3}){3}(:[[:digit:]]+)?$"

  # TRUE where `x` contains at least one of `fragments` as a literal
  # substring. grepl() returns FALSE for NA, so missing values never match.
  contains_any <- function(x, fragments) {
    Reduce(
      function(hit, fragment) hit | grepl(fragment, x, fixed = TRUE),
      fragments,
      rep(FALSE, length(x))
    )
  }

  # The predicates use the columns themselves (bare names), not quoted
  # strings: comparing the literal "AUTHORITY_URI" to a host name is always
  # TRUE and filters nothing. Each predicate is written as "not a match" so
  # rows with NA values are retained.
  datanoads <- data %>%
    dplyr::filter(
      !(AUTHORITY_URI %in% blocked_hosts),
      !contains_any(AUTHORITY_URI, blocked_fragments),
      !grepl(ip_pattern, AUTHORITY_URI),
      !grepl(ip_pattern, AUTHORITY_URI_REFERRER),
      !(AUTHORITY_URI_REFERRER %in% blocked_referrers)
    )

  # An empty matchvector collapses to the pattern "", which matches every
  # string; treat "no topics supplied" as "nothing is topic related".
  topic_hit <- if (length(matchvector) > 0) {
    grepl(paste(matchvector, collapse = "|"), datanoads$URL_REQUEST)
  } else {
    rep(FALSE, nrow(datanoads))
  }

  datanoads <- datanoads %>%
    dplyr::mutate(
      referrer = dplyr::case_when(
        grepl("facebook", AUTHORITY_URI_REFERRER, fixed = TRUE) ~ "Facebook",
        # Match the host "t.co" itself; a bare "t.co" pattern also matches
        # inside hosts such as "reddit.com".
        grepl("(^|[./])t\\.co($|[:/])", AUTHORITY_URI_REFERRER) ~ "Twitter",
        grepl("Direct", AUTHORITY_URI_REFERRER, fixed = TRUE) ~ "direct",
        is.na(AUTHORITY_URI_REFERRER) ~ "Unknown",
        TRUE ~ "Website"
      ),
      topicrelated = dplyr::if_else(topic_hit, "Yes", "No")
    ) %>%
    dplyr::as_tibble()

  datanoads$referrer <- as.factor(datanoads$referrer)
  datanoads$VENDORNAME_OPERATINGSYSTEM <-
    as.factor(datanoads$VENDORNAME_OPERATINGSYSTEM)
  datanoads$TYPE_HARDWAREPLATFORM <-
    as.factor(datanoads$TYPE_HARDWAREPLATFORM)
  datanoads$TIMESTAMP_INIT <-
    lubridate::ymd_hms(as.POSIXct(datanoads$TIMESTAMP_INIT))
  datanoads$DURATION_FROMCLICKTOCREATION <-
    as.numeric(datanoads$DURATION_FROMCLICKTOCREATION) / 3600

  datanoads
}
# gallagherj2008/SWAT documentation built on May 28, 2019, 12:59 p.m.