# R/handle.handles.R

#' @title Handle handles from a vector
#'
#' @description Removes handles, extracts them, or both, from a character
#'   vector. A handle is an `@` symbol followed by one or more word
#'   characters (regular expression `@\\w+`), e.g. `@user`. Handles often
#'   appear in social media data (Facebook posts, Instagram posts or tweets);
#'   you may want to strip them, or keep them for downstream analysis of e.g.
#'   conversation dynamics.
#'
#' @details Only the `@\\w+` match itself is treated as the handle, so
#'   punctuation next to a handle (as in `"@user,"`) is neither extracted nor
#'   removed. Note that the pattern also matches inside e-mail addresses
#'   (`"me@site.com"` yields `"@site"`). Every entry that contains at least
#'   one handle is trimmed and has runs of whitespace collapsed to a single
#'   space, whether or not `remove` is `TRUE`; entries without handles (and
#'   `NA` entries) are returned untouched.
#'
#' @param text.clean character vector containing zero or more strings
#' @param extract logical, indicating whether to extract the handles
#' @param remove logical, indicating whether to remove the handles
#' @return list where the first item is the cleaned text (handles are not
#'   removed if `remove` is `FALSE`). If `extract` is `TRUE`, the second item
#'   is a data frame with one row per entry that contained handles: column
#'   `position` is the index of the entry in `text.clean` and column `handles`
#'   holds that entry's handles joined by `", "`.
#' @examples
#' handle.handles(c("hi @bob, meet @alice", "no handles here"))
#' @export
handle.handles <- function(text.clean,
                           extract = TRUE,
                           remove = TRUE) {

  if (!is.character(text.clean)) {
    stop("class is not 'character'", call. = FALSE)
  }

  handle.pattern <- "@\\w+"

  # One character vector of matched handles per input element. Matching the
  # handle pattern directly (rather than splitting the text into segments and
  # keeping segments that contain a handle) keeps attached punctuation out of
  # the result.
  found <- unname(
    regmatches(text.clean, gregexpr(handle.pattern, text.clean))
  )
  has.handles <- lengths(found) > 0L & !is.na(text.clean)

  if (isTRUE(remove)) {
    # A single regex pass removes every handle at once; removing handles one
    # by one as fixed strings would corrupt "@anna" after removing "@ann".
    text.clean[has.handles] <- gsub(
      handle.pattern, "", text.clean[has.handles]
    )
  }

  # Whitespace clean-up is applied to entries that contained handles even
  # when `remove` is FALSE.
  text.clean[has.handles] <- gsub(
    "\\s+", " ", trimws(text.clean[has.handles])
  )

  out.results <- list(text.clean)
  if (isTRUE(extract)) {
    out.results[[2]] <- data.frame(
      position = which(has.handles),
      handles = vapply(
        found[has.handles],
        paste0,
        character(1),
        collapse = ", ",
        USE.NAMES = FALSE
      ),
      stringsAsFactors = FALSE
    )
  }
  out.results
}
# bvidgen/tc documentation built on May 9, 2019, 2:21 a.m.