tc: Text Cleaning

#' @title Combine single letter words
#'
#' @description A function to combine words in which the letters have been separated with spaces to avoid content moderation. It was inspired by Hosseini et al.'s paper on deceiving the Perspective API, available here: https://arxiv.org/pdf/1702.08138.pdf

#' @param pattern vector containing one or more regular expressions (or character string for fixed = TRUE) to be matched in the given character vectors.
#' @param replacement vector containing one or more replacements for matched pattern. Must be the same length as pattern.
#' @param x a character vector where matches are sought, or an object which can be coerced by as.character to a character vector.
#' @return vector with substituted values
#' @export

slw = function(temp){
  if(class(temp) != "character"){
    stop('class is not \'character\'')
  }
  temp.original = unname(temp)
  temp = tolower(temp)
  temp = gsub(x = temp,
                    pattern = "\\s+",
                    replacement = " ")
  temp

  # convert the letters and numbers and punctuation to 'a' values
  temp.a = gsub(pattern = "[[:punct:]A-z0-9]", # http://www.endmemo.com/program/R/gsub.php - should be all punctuation, letters and numbers
                replacement = 'a', # A-z0-9
                x = temp)
  temp.a
  temp.a = paste0(temp.a, ' aaaa') # add this in because the code has an issue with the final value -> and it won't mess up the indexing. Before I was adding + 1 to the final item to be manipulated, but that as messing up other code

  # index where the spaces occur in the string
  index.slw = gregexpr(pattern = ' ',
                       text = temp.a)
  index.slw = unlist(index.slw)
  index.slw # these are the locations of every space, i.e. between every word/string of letters

  # what is the size of the spaces between the values? # 2 indicates a single letter, anything more indicates a legit string
  diff.slw = diff(index.slw)
  diff.slw

  # which of the spaces are just of size 2?
  which.2 = which(unlist(gregexpr(pattern = '^2$',
                                  text = diff.slw)) == 1)
  if(length(which.2) == 0){
    print('no floating letters have been found! Returning lowered text')
    return(temp)
  }
  which.2

  # which of the entries are consecutive? (i.e. we know there is a problem)
  which.2.consecutive = (which(diff(which.2) == 1) + 1) # add one because it tells us which are consecutive from the one before - i.e. this way the values line up
  which.2.consecutive
  if(length(which.2.consecutive) == 0){
    print('no strings of floating letters have been found! Floating letters might be legit. Returning lowered text')
    return(temp)
  }

  # where do the breaks fall to indicate a new 'run' of single letters?
  which.2.new = (which(diff(which.2.consecutive) != 1) + 1)  # add one, again because we want to know where the consecutive value ends not 'lands'
  which.2.new = unique(c(1, which.2.new)) # 1 is not necessarily included - we know that these are all consecutive single letters, so it is fine to manually add this in
  which.2.new

  # get the ranges for each of the 'runs' of consecutive 2s
  list.ranges = list()
  for (i in 1:length(which.2.new)){
    #print(i)

    if(i != length(which.2.new)){
      list.ranges[[i]] = (which.2.new[i]:(which.2.new[i+1]-1))

    } else {
      list.ranges[[i]] = which.2.new[i]:length(which.2.consecutive)
    }
    # print(list.ranges[i])
  }
  list.ranges #  the runs of consecutive 2s split up into separate items in a list - pretty simple!

  # now actually work out which values to swap out
  save.combine = list()
  for(j in 1:length(list.ranges)){
    #print(j)
    #if(length(list.ranges[[i]] > 1)){
    temp.range = c(min(which.2.consecutive[list.ranges[[j]]]) - 1,
                   which.2.consecutive[list.ranges[[j]]]) # add back on one below the minimum value
    temp.range
    temp.range = c(which.2[temp.range],
                   max(which.2[temp.range]) + 1) # add back on one above the max value (to account for the diff calculations)
    temp.range

    save.combine[[j]] = index.slw[temp.range]
    #} else {

    #}


    rm(temp.range)}
  save.combine

  # Now, for the save.combine values, get the identified single letter words, and the equivalent squashed together text
  list.in = list()
  list.out = list()
  for (k in 1:length(save.combine)){
    #print(k)
    #print(save.combine[k])
    list.in[k] = substr(x = temp, # we now use the original temp object  rather than temp.a -> with temp we have the position of letters and now we actually swap them round
                        start = min(range(save.combine[k])),
                        stop = max(range(save.combine[k])))
    list.out[k] = paste0(' ',
                         trimws(gsub(pattern = ' ',
                                     replacement = '',
                                     x = list.in[k])),
                         ' ') }
  list.in; list.out

  # Swap out the values
  for(i in 1:length(list.in)){
    #print(i)
    #print(list.out[[i]])
    #print(list.in[[i]])
    temp = gsub(pattern = list.in[[i]],
                replacement = list.out[[i]],
                x = temp)
  }
  temp = trimws(temp)
  temp = unname(temp)
  return(temp)
}