#' @title Split up hashtags
#'
#' @description Splits hashtags into their component words, but only where the
#'   words are marked by capital letters (e.g. `#HelloWorld` becomes
#'   `Hello World`). Hashtags often carry important semantic information, which
#'   is easier to use as separate words than as a sparsely occurring string of
#'   several words run together without spaces.
#'
#' @details For every element of `x` that contains at least one hashtag, each
#'   hashtag has its digits and punctuation (including the `#`) removed and a
#'   space inserted in front of every upper-case letter; capitalised acronyms
#'   are therefore split letter by letter. Runs of whitespace in that element
#'   are then collapsed to a single space and the element is trimmed. Elements
#'   without a hashtag (including `NA`) are returned unchanged. A progress line
#'   is printed every 10000 elements.
#'
#' @param x character vector containing zero or more strings
#' @return character vector of the same length as `x` with hashtags separated
#' @export
split.hashtags <- function(x) {
  # is.character() also accepts character vectors carrying extra classes, for
  # which `class(x) != "character"` would produce a length > 1 condition.
  if (!is.character(x)) {
    stop('class is not \'character\'')
  }

  # seq_along() so that empty input performs no iterations at all.
  for (i in seq_along(x)) {
    if (i %% 10000 == 0) {
      print(paste0(i, ' entries have been processed'))
    }

    # Cut the string into boundary-delimited pieces and keep those that
    # contain a '#' followed by at least one non-space character.
    pieces <- unlist(stringi::stri_extract_all_boundaries(x[i]))
    tags <- trimws(pieces[grepl(pattern = "#\\S+", x = pieces)])
    if (length(tags) == 0) {
      next
    }

    # Replace the longest hashtags first: the replacement below is a literal
    # match, so handling "#Hello" before "#HelloWorld" would eat the '#' of the
    # longer tag and leave it unsplit. Byte length is enough for ordering (a
    # substring is never longer than its container) and never errors on
    # badly encoded strings.
    tags <- unique(tags)
    tags <- tags[order(nchar(tags, type = "bytes"), decreasing = TRUE)]

    # Strip digits, then all punctuation (including the '#') and spaces.
    # https://stackoverflow.com/questions/13590139/remove-numbers-from-alphanumeric-characters
    cleaned <- gsub("[[:digit:]]+", "", tags)
    cleaned <- gsub("[[:punct:] ]+", "", cleaned)

    # Insert a space before every capital letter. Acronyms are split letter by
    # letter on purpose; other functions deal with those later.
    # https://stackoverflow.com/questions/7988959/splitting-string-based-on-letters-case
    spaced <- gsub("([[:upper:]])", " \\1", cleaned, perl = TRUE)

    text <- x[[i]]
    for (j in seq_along(tags)) {
      text <- gsub(tags[j], spaced[j], text, fixed = TRUE)
    }

    # Remove the extra white space introduced by the replacements.
    text <- gsub(pattern = "\\s+", replacement = " ", x = text)
    x[[i]] <- trimws(text)
  }

  x
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.