# R/remove.duplicates.R

#' Remove duplicated strings from a vector
#'
#' Drops every repeated occurrence of a string from a character vector,
#' keeping the first occurrence of each distinct value in its original
#' position. This is useful after text cleaning: in a vector of e.g. 100
#' tweets, two tweets with the same content but different URLs become
#' identical once the URLs are stripped. Whether such duplicates should be
#' removed depends upon your downstream tasks.
#'
#' Two status lines are printed: the number of duplicated strings found
#' before removal, and the number still present afterwards.
#'
#' @param text.clean Character vector containing one or more strings (i.e.
#'   length is equal to or greater than 1). Anything that is not a plain
#'   character vector (e.g. numeric input, a factor, or a matrix) is rejected
#'   with an error.
#' @return `text.clean` with duplicated strings removed. If no duplicates are
#'   found the input is returned unchanged.
#' @examples
#' remove.duplicates(c("good morning", "hello", "good morning"))
#' @export
remove.duplicates <- function(text.clean) {

  # `class(x) != "character"` yields one logical per class, which `if` cannot
  # evaluate when the object carries several classes. `is.character()` always
  # gives a single TRUE/FALSE. Objects with a `dim` attribute (matrices,
  # arrays) are still rejected, as they never had class "character".
  if (!is.character(text.clean) || !is.null(dim(text.clean))) {
    stop('class is not \'character\'')
  }

  dup.tweet <- base::which(duplicated(text.clean))
  print(paste0('there are this many duplicated tweets: ', length(dup.tweet)))

  # Negative indexing with a zero-length index would select nothing at all,
  # so only subset when there is at least one duplicate to drop.
  if (length(dup.tweet) > 0L) {
    text.clean <- text.clean[-dup.tweet]
  }

  # Recount on the filtered vector so the caller sees confirmation that the
  # duplicates are gone.
  dup.tweet <- base::which(duplicated(text.clean))
  print(paste0('there are now this many: ', length(dup.tweet)))

  text.clean
}
# bvidgen/tc documentation built on May 9, 2019, 2:21 a.m.