# R/text.R

# Defines functions recursive_split emailRegex readCharAll search_df

#' Search DataFrame
#' @description Case-insensitive regular-expression search over every cell of
#' a data frame.
#' @param pattern regular expression to look for; matching ignores case
#' @param df data frame to search; it is converted with \code{as.matrix}
#' before matching, so every cell is compared as text
#'
#' @return the rows of \code{df} that contain at least one matching cell, in
#' order of first match (cells are scanned column by column). The row names of
#' the result are set to the original row positions. A zero-row data frame is
#' returned when nothing matches.
#' @export
#'
#' @examples
#' df1 <- as.data.frame(data()$results)
#' search_df('nile', df = df1)
search_df <- function(pattern = 'nile', df) {
  # fixed = TRUE and ignore.case = TRUE do not work together in grep(), so
  # the pattern is always interpreted as a regular expression.
  hits <- grep(pattern, as.matrix(df), ignore.case = TRUE, value = FALSE)

  # grep() yields column-major linear positions into the matrix. Convert them
  # to 1-based row numbers; a plain `hits %% nrow(df)` would turn every match
  # in the last row into 0 and lose it.
  idx <- unique((hits - 1L) %% nrow(df) + 1L)

  df <- df[idx, , drop = FALSE]
  rownames(df) <- idx
  df
}

#' Read all character
#' @description Loads the complete content of a text file into one character
#' string. The number of characters requested equals the file size in bytes as
#' reported by \code{file.info}.
#' @param fName path of the file to be read
#'
#' @return a single character string holding the file content
#' @export
#'
#' @examples
#' \dontrun{
#' readCharAll("D:\\My.txt")
#' }
readCharAll <- function(fName) {
  # Ask for as many characters as the file has bytes so nothing is cut off.
  n_bytes <- file.info(fName)$size
  readChar(fName, n_bytes)
}


#' Email Regex
#' @description Normalises raw email text with a fixed sequence of regular
#' expression substitutions: HTML tags are replaced by a space, runs of digits
#' by "number", http/https addresses by "httpaddr", email addresses by
#' "emailaddr" and runs of dollar signs by "dollar".
#' @param email_contents character vector, usually the content of an email
#'
#' @return character vector of the same length with the replacements applied
#' @export
#'
#' @examples
#' emailRegex("someone@example.com http://google.com <html tag > Some text")
emailRegex <- function(email_contents) {
  # Strip all HTML: any expression that starts with <, ends with > and has no
  # < or > inside is replaced by a single space.
  email_contents <- gsub("<[^<>]+>", " ", email_contents)

  # Numbers: one or more consecutive digits. This runs before the URL and
  # address steps, so digits inside those tokens are rewritten first; the
  # whole token is replaced afterwards anyway.
  email_contents <- gsub("[0-9]+", "number", email_contents)

  # URLs: http:// or https:// followed by everything up to the next
  # whitespace. perl = TRUE is needed for \s inside a bracket expression.
  email_contents <-
    gsub("(http|https)://[^\\s]*", "httpaddr", email_contents, perl = TRUE)

  # Email addresses: a run of non-whitespace characters with an @ in the
  # middle.
  email_contents <-
    gsub("[^\\s]+@[^\\s]+", "emailaddr", email_contents, perl = TRUE)

  # Dollar signs: one or more consecutive $ characters.
  gsub("[$]+", "dollar", email_contents)
}

#' Recursive String Tokenizer
#' @description Tokenizes strings by splitting them, one splitter character
#' after another, at every character contained in \code{splitters} (by default
#' common punctuation, space, newline, carriage return and tab). Empty tokens
#' are removed from the result.
#' @param x a character vector (or a list of character vectors)
#' @param splitters a single string; each of its characters is used as a
#' literal (non-regex) separator
#' @param ii for internal use: position in \code{splitters} of the first
#' separator to apply
#'
#' @return a character vector of all tokens, or \code{NULL} when \code{x} is
#' \code{NULL} or empty, or when no non-empty token remains
#' @export
#'
#' @examples
#' recursive_split("This is a test%string$to[tokenize].")
recursive_split <- function(x, splitters = " @$#&?!.,;:*+/-_[](){}><'\"\n\r\t", ii = 1) {
  # Caveat: when the text contains numbers, exclude . and - from splitters.
  # The last 3 default splitters are escaped characters: newline, carriage
  # return and tab (preceded by an escaped double quote).
  if (is.null(x) || length(x) == 0) {
    return(NULL)
  }

  tokens <- unlist(x)
  n_splitters <- nchar(splitters)

  # Apply every splitter from position ii up to and including the last one.
  # Stop early once no tokens are left: strsplit() drops empty strings, and
  # calling it on the resulting NULL would raise an error.
  while (ii <= n_splitters && length(tokens) > 0) {
    tokens <- unlist(strsplit(tokens, substr(splitters, ii, ii), fixed = TRUE))
    ii <- ii + 1
  }

  # Empty tokens produced by the final split have no later pass to remove
  # them, so drop them explicitly.
  tokens <- tokens[nzchar(tokens)]
  if (length(tokens) == 0) {
    return(NULL)
  }
  tokens
}
# faridcher/futils documentation built on Oct. 20, 2017, 9:52 a.m.