# R/split.hashtags.R

#' @title Split up hashtags
#'
#' @description Splits hashtags into their component words, but only where
#'   the words are marked by capital letters (e.g. "#MachineLearning" becomes
#'   "Machine Learning"). Hashtags often carry important semantic information,
#'   which we want to extract rather than leave as sparsely occurring strings
#'   of several words run together without spaces.
#'
#' @details For every element of \code{x} the text is cut into tokens with
#'   \code{stringi::stri_extract_all_boundaries()} and every token containing
#'   a "#" followed by at least one non-space character is treated as a
#'   hashtag token. Digits, punctuation (including the "#") and spaces are
#'   stripped from the token, and a space is inserted before each upper-case
#'   letter. The original token is then replaced literally by the result.
#'   Elements in which at least one hashtag token was found additionally have
#'   runs of whitespace collapsed to a single space and are trimmed; all other
#'   elements (including \code{NA}) are returned untouched. A progress line is
#'   printed after every 10000 elements.
#'
#'   Note that, despite the dotted name, this is an ordinary function and not
#'   intended as an S3 method of \code{split()}.
#'
#' @param x character vector containing zero or more strings.
#' @return character vector of the same length as \code{x}, with hashtags
#'   separated into words.
#' @examples
#' split.hashtags("I love #MachineLearning today")
#' @export
split.hashtags <- function(x) {

  # is.character() always yields a single TRUE/FALSE, whereas comparing
  # class(x) with "character" gives `if` a vector when x has several classes.
  if (!is.character(x)) {
    stop("class is not 'character'")
  }

  # Apply several literal (non-regex) replacements to `x`, one after another,
  # in the order given.
  mgsub <- function(pattern, replacement, x) {
    if (length(pattern) != length(replacement)) {
      stop("pattern and replacement do not have the same length.")
    }

    result <- x
    for (k in seq_along(pattern)) {
      result <- gsub(pattern[k], replacement[k], result, fixed = TRUE)
    }
    result
  }

  # seq_along() (unlike 1:length(x)) does not enter the loop for empty input.
  for (i in seq_along(x)) {
    if (i %% 10000 == 0) {
      print(paste0(i, " entries have been processed"))
    }

    # Tokenise with stringi's default break iterator and keep only the tokens
    # that contain a hashtag. NA input produces no matching token.
    tokens <- unlist(stringi::stri_extract_all_boundaries(x[i]))
    tags <- tokens[grepl(pattern = "#\\S+", x = tokens)]
    # A hashtag used twice only needs to be replaced once.
    tags <- unique(base::trimws(tags))

    if (length(tags) > 0) {

      # Strip digits, then all punctuation (including the # symbol) and spaces.
      words <- base::gsub("[[:digit:]]+", "", tags)
      words <- base::gsub("[[:punct:] ]+", "", words)

      # Put a space in front of every capital letter. Capitalised acronyms are
      # deliberately split letter by letter as well.
      words <- base::gsub("([[:upper:]])", " \\1", words, perl = TRUE)

      # Replace the longest tokens first: otherwise a short hashtag that is a
      # prefix of a longer one ("#Foo" vs "#FooBar") would rewrite part of the
      # longer one, which would then never be matched. Byte length is enough
      # to guarantee that a string is handled before any of its substrings.
      longest_first <- order(nchar(tags, type = "bytes"), decreasing = TRUE)

      x[[i]] <- mgsub(tags[longest_first], words[longest_first], x = x[[i]])

      # Collapse the whitespace introduced by the replacements.
      x[[i]] <- base::gsub(pattern = "\\s+", replacement = " ", x = x[[i]])
      x[[i]] <- base::trimws(x[[i]])
    }
  }

  x
}
# bvidgen/tc documentation built on May 9, 2019, 2:21 a.m.