# Nothing
#' qdapRegex: Regular Expression Removal, Extraction, & Replacement Tools for the \pkg{qdap} Package
#'
#' \pkg{qdapRegex} is a collection of regular expression tools associated with
#' the \pkg{qdap} package that may be useful outside of the context of discourse
#' analysis. Tools include removal/extraction/replacement of abbreviations,
#' dates, dollar amounts, email addresses, hash tags, numbers, percentages,
#' citations, person tags, phone numbers, times, and zip codes.
#'
#' The \pkg{qdapRegex} package does not aim to compete with string manipulation
#' packages such as
#' \href{https://CRAN.R-project.org/package=stringr}{\pkg{stringr}}
#' or \href{https://CRAN.R-project.org/package=stringi}{\pkg{stringi}}
#' but is meant to provide access to canned, common regular expression patterns
#' that can be used within \pkg{qdapRegex}, with \pkg{R}'s own regular
#' expression functions, or add on string manipulation packages such as
#' \code{stringr} and \code{stringi}.
#'
#' @docType package
#' @name qdapRegex
#' @aliases qdapRegex package-qdapRegex
NULL
#' Regular Expression Cheat Sheet
#'
#' A dataset containing the regex chunk name, the regex string, and a
#' description of what the chunk does.
#'
#' @details
#' \itemize{
#' \item Name. The name of the regex chunk.
#' \item Regex. The regex chunk.
#' \item What it Does. Description of what the regex chunk does.
#' }
#'
#' @docType data
#' @name regex_cheat
#' @usage data(regex_cheat)
#' @format A data frame with 6 rows and 3 variables
NULL
#' Canned Regular Expressions (United States of America)
#'
#' A dataset containing a list of U.S. specific, canned regular expressions for use
#' in various functions within the \pkg{qdapRegex} package.
#'
#' @docType data
#' @details The following canned regular expressions are included:
#' \describe{
#' \item{rm_abbreviation}{abbreviations containing single lower case or capital letter followed by a period and then an optional space (this must be repeated 2 or more times)}
#' \item{rm_between}{Remove characters between a left and right boundary including the boundaries; note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{rm_between2}{Remove characters between a left and right boundary NOT including the boundaries; note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{rm_caps}{words containing 2 or more consecutive upper case letters and no lower case}
#' \item{rm_caps_phrase}{phrases of 1 word or more containing 1 or more consecutive upper case letters and no lower case; if phrase is one word long then phrase must be 2 or more consecutive capital letters}
#' \item{rm_citation}{substring that looks for in-text and parenthetical APA6 style citations (attempts to exclude references)}
#' \item{rm_citation2}{substring that looks for in-text APA6 style citations (attempts to exclude references)}
#' \item{rm_citation3}{substring that looks for parenthetical APA6 style citations (attempts to exclude references)}
#' \item{rm_city_state}{substring with \emph{city} (single lower case word or multiple consecutive capitalized words before a comma and state) & \emph{state} (2 consecutive capital letters)}
#' \item{rm_city_state_zip}{substring with \emph{city} (single lower case word or multiple consecutive capitalized words before a comma and state) & \emph{state} (2 consecutive capital letters) & \emph{zip code} (exactly 5 or 5+4 consecutive digits)}
#' \item{rm_date}{dates in the form of 2 digit month, 2 digit day, and 2 or 4 digit year. Separator between month, day, and year may be dot (.), slash (/), or dash (-)}
#' \item{rm_date2}{dates in the form of 3-9 letters followed by one or more spaces, 2 digits, a comma(,), one or more spaces, and 4 digits}
#' \item{rm_date3}{dates in the form of XXXX-XX-XX; hyphen separated string of 4 digit year, 2 digit month, and 2 digit day}
#' \item{rm_date4}{dates in the form of any of \code{rm_date}, \code{rm_date2}, or \code{rm_date3}}
#' \item{rm_dollar}{substring with dollar sign ($) followed by (1) just dollars (no decimal), (2) dollars and cents (whole number and decimal), or (3) just cents (decimal value); dollars may contain commas}
#' \item{rm_email}{substring with (1) alphanumeric characters or dash (-), plus (+), or underscore (_) (\emph{This may be repeated}) (2) followed by at (@@), followed by the same regex sequence as before the at (@@), and ending with dot (.) and 2-14 digits}
#' \item{rm_emoticon}{common emoticons (logic is complicated to explain in words) using ">?[:;=8XB]\{1\}[-~+o^]?[|\")(>DO>\{pP3/]+|</?3|XD+|D:<|x[-~+o^]?[|\")(>DO>\{pP3/]+" regex pattern; general pattern is optional hat character, followed by eyes character, followed by optional nose character, and ending with a mouth character}
#' \item{rm_endmark}{substring of the last endmark group in a string; endmarks include (! ? . * OR |)}
#' \item{rm_endmark2}{substring of the last endmark group in a string; endmarks include (! ? OR .)}
#' \item{rm_endmark3}{substring of the last endmark group in a string; endmarks include (! ? . * | ; OR :)}
#' \item{rm_hash}{substring that begins with a hash (#) followed by a word}
#' \item{rm_nchar_words}{substring of letters (that may contain apostrophes) n letters long (apostrophe not counted in length); note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{rm_nchar_words2}{substring of letters (that may contain apostrophes) n letters long (apostrophe counted in length); note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{rm_non_ascii}{substring of 2 digits or letters a-f inside of a left and right angle brace in the form of \code{"<a4>"}}
#' \item{rm_non_words}{substring of any character that isn't a letter, apostrophe, or single space}
#' \item{rm_number}{substring that may begin with dash (-) for negatives, and is (1) just whole number (no decimal), (2) whole number and decimal, or (3) just decimal value; regex pattern provided by Jason Gray}
#' \item{rm_percent}{substring beginning with (1) just whole number (no decimal), (2) whole number and decimal, or (3) just decimal value and followed by a percent sign (\%)}
#' \item{rm_phone}{phone numbers in the form of optional country code, valid 3 digit prefix, and 7 digits (may contain hyphens and parenthesis); logic is complex to explain (see \url{https://stackoverflow.com/a/21008254/1000343} for more)}
#' \item{rm_postal_code}{U.S. state abbreviations (and District of Columbia) that is constrained to just possible U.S. state names, not just two consecutive capital letters; taken from Mike Hamilton's submission found \url{https://regexlib.com/REDetails.aspx?regexp_id=2177}}
#' \item{rm_repeated_characters}{substring with a repetition of repeated characters within a word; regex pattern retrieved from \href{https://stackoverflow.com}{StackOverflow}'s, \href{https://stackoverflow.com/users/3679490/vks}{vks}: \url{https://stackoverflow.com/a/29438461/1000343}}
#' \item{rm_repeated_phrases}{substring with a phrase (a sequence of 1 or more words) that is repeated 2 or more times (case is ignored; separating periods and commas are ignored); regex pattern retrieved from \href{https://stackoverflow.com}{StackOverflow}'s, \href{https://stackoverflow.com/users/2725969/brodieg}{BrodieG}: \url{https://stackoverflow.com/a/28786617/1000343}}
#' \item{rm_repeated_words}{substring with a word (marked with a boundary) that is repeated 2 or more times (case is ignored)}
#' \item{rm_tag}{substring that begins with an at (@@) followed by a word}
#' \item{rm_tag2}{Twitter substring that begins with an at (@@) followed by a word composed of alpha-numeric characters and underscores, no longer than 15 characters}
#' \item{rm_title_name}{substring beginning with title (Mrs., Mr., Ms., Dr.) that is case independent or full title (Miss, Mizz, mizz) followed by a single lower case word or multiple capitalized words}
#' \item{rm_time}{substring that (1) must begin with 0-2 digits, (2) must be followed by a single colon (:), (3) optionally may be followed by either a colon (:) or a dot (.), (4) optionally may be followed by 1-infinite digits (if previous condition is true)}
#' \item{rm_time2}{substring that is identical to \code{rm_time} with the additional search for Ante Meridiem/Post Meridiem abbreviations (e.g., AM, p.m., etc.)}
#' \item{rm_transcript_time}{substring that is specific to transcription time stamps in the form of HH:MM:SS.OS where OS is milliseconds. HH: and .OS are optional. The SS.OS period divide may also be a comma or additional colon. The HH:SS divide may also be a period. String may be affixed with pound sign (#).}
#' \item{rm_twitter_url}{\href{https://twitter.com/}{Twitter} short link/url; substring optionally beginning with \emph{http}, followed by \emph{t.co} ending on a space or end of string (whichever comes first)}
#' \item{rm_url}{substring beginning with \emph{http}, \emph{www.}, or \emph{ftp} and ending on a space or end of string (whichever comes first); note that this regex is simple and may not cover all valid URLs or may include invalid URLs}
#' \item{rm_url2}{substring beginning with \emph{http}, \emph{www.}, or \emph{ftp} and more constrained than \code{rm_url}; based on @@imme_emosol's response from \url{https://mathiasbynens.be/demo/url-regex}}
#' \item{rm_url3}{substring beginning with \emph{http} or \emph{ftp} and more constrained than \code{rm_url} & \code{rm_url2} though light-weight, making it ideal for validation purposes; taken from @@imme_emosol's response found \url{https://mathiasbynens.be/demo/url-regex}}
#' \item{rm_white}{substring of white space(s); this regular expression combines \code{rm_white_bracket}, \code{rm_white_colon}, \code{rm_white_comma}, \code{rm_white_endmark}, \code{rm_white_lead}, \code{rm_white_trail}, and \code{rm_white_multiple}}
#' \item{rm_white_bracket}{substring of white space(s) following left brackets ("\{", "(", "[") or preceding right brackets ("\}", ")", "]")}
#' \item{rm_white_colon}{substring of white space(s) preceding colon(s)/semicolon(s)}
#' \item{rm_white_comma}{substring of white space(s) preceding a comma}
#' \item{rm_white_endmark}{substring of white space(s) preceding a single occurrence/combination of period(s), question mark(s), and exclamation point(s)}
#' \item{rm_white_lead}{substring of leading white space(s)}
#' \item{rm_white_lead_trail}{substring of leading/trailing white space(s)}
#' \item{rm_white_multiple}{substring of multiple, consecutive white spaces}
#' \item{rm_white_punctuation}{substring of white space(s) preceding a comma or a single occurrence/combination of colon(s), semicolon(s), period(s), question mark(s), and exclamation point(s)}
#' \item{rm_white_trail}{substring of trailing white space(s)}
#' \item{rm_zip}{substring of 5 digits optionally followed by a dash and 4 more digits}
#' }
#' @name regex_usa
#' @section Extra: Use \code{qdapRegex:::examine_regex()} to interactively explore the
#' regular expressions in \code{regex_usa}. This will provide a browser + console
#' based break down of each regex in the dictionary.
#' @usage data(regex_usa)
#' @format A list with 54 elements
NULL
#' Supplemental Canned Regular Expressions
#'
#' A dataset containing a list of supplemental, canned regular expressions. The
#' regular expressions in this data set are considered useful but have not been
#' included in a formal function (of the type \code{rm_XXX}). Users can utilize
#' the \code{rm_} function to generate functions that can sub/replace/extract as
#' desired.
#'
#' @docType data
#' @name regex_supplement
#' @details The following canned regular expressions are included:
#' \describe{
#' \item{after_a}{single word after the word "a"}
#' \item{after_the}{single word after the word "the"}
#' \item{after_}{find single word after ? word (? = user defined); note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own (user supplies (1) n before, (2) the point, & (3) n after)}
#' \item{around_}{find n words (not including punctuation) before or after ? word (? = user defined); note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own (user supplies (1) n before, (2) the point, & (3) n after)}
#' \item{around2_}{find n words (plus punctuation) before or after ? word (? = user defined); note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{before_}{find single word before ? word (? = user defined); note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{except_first}{find all occurrences of a substring except the first; regex pattern retrieved from \href{https://stackoverflow.com/users/3732271/akrun}{StackOverflow's akrun}: \url{https://stackoverflow.com/a/31458261/1000343}}
#' \item{hexadecimal}{substring beginning with hash (#) followed by either 3 or 6 select characters (a-f, A-F, and 0-9)}
#' \item{ip_address}{substring of four chunks of 1-3 consecutive digits separated with dots (.)}
#' \item{last_occurrence}{last occurrence of a delimiter; note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own (user supplies the delimiter)}
#' \item{pages}{substring with "pp." or "p.", optionally followed by a space, followed by 1 or more digits, optionally followed by a dash, optionally followed by 1 or more digits, optionally followed by a semicolon, optionally followed by a space, optionally followed by 1 or more digits; intended for extraction/removal purposes}
#' \item{pages2}{substring 1 or more digits, optionally followed by a dash, optionally followed by 1 or more digits, optionally followed by a semicolon, optionally followed by a space, optionally followed by 1 or more digits; intended for validation purposes}
#' \item{punctuation}{punctuation characters (\code{[:punct:]}) with the ability to negate; note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{run_split}{a regex that is useful for splitting strings into character runs (e.g., "wwxyyyzz" becomes "ww", "x", "yyy", "zz"); regex pattern retrieved from \href{https://stackoverflow.com/users/2994949/rawr}{Robert Redd}: \url{https://stackoverflow.com/a/29383435/1000343}}
#' \item{split_keep_delim}{regex string that splits on a delimiter and retains the delimiter}
#' \item{thousands_separator}{chunks digits > 4 into groups of 3 from right to left allowing for easy insertion of thousands separator; regex pattern retrieved from \href{https://stackoverflow.com/}{StackOverflow}'s stema: \url{https://stackoverflow.com/a/10612685/1000343}}
#' \item{time_12_hours}{substring of valid hours (1-12) followed by a colon (:) followed by valid minutes (0-60), followed by an optional space and the character chunk \emph{am} or \emph{pm}}
#' \item{version}{substring starting with "v" or "version" optionally followed by a space and then period separated digits for <major>.<minor>.<release>.<build>; the build sequence is optional and the "version"/"v" IS NOT contained in the substring}
#' \item{version2}{substring starting with "v" or "version" optionally followed by a space and then period separated digits for <major>.<minor>.<release>.<build>; the build sequence is optional and the "version"/"v" IS contained in the substring}
#' \item{white_after_comma}{substring of white space after a comma}
#' \item{word_boundary}{A true word boundary that only includes alphabetic characters; based on \url{https://www.rexegg.com/}'s suggestion taken from \href{https://www.rexegg.com/regex-boundaries.html#real-word-boundary}{discussion of true word boundaries}; note contains \code{"\%s"} that is replaced by \code{\link[base]{sprintf}} and is not a valid regex on its own}
#' \item{word_boundary_left}{A true left word boundary that only includes alphabetic characters; based on \url{https://www.rexegg.com/}'s suggestion taken from \href{https://www.rexegg.com/regex-boundaries.html#real-word-boundary}{discussion of true word boundaries}}
#' \item{word_boundary_right}{A true right word boundary that only includes alphabetic characters; based on \url{https://www.rexegg.com/}'s suggestion taken from \href{https://www.rexegg.com/regex-boundaries.html#real-word-boundary}{discussion of true word boundaries}}
#' \item{youtube_id}{substring of the video id from a \href{https://www.youtube.com}{YouTube} video; taken from Jacob Overgaard's submission found \url{https://regex101.com/r/kU7bP8/1}}
#' }
#'
#' Regexes from this data set can be added to the \code{pattern} argument of any
#' \code{rm_XXX} function via an at sign (@@) followed by a regex name from
#' this data set (e.g., \code{pattern = "@@after_the"}) provided the regular
#' expression does not contain non-regex such as \code{\link[base]{sprintf}}
#' character string \code{\%s}.
#' @section Warning: Note that regexes containing \code{\%s} are replaced by
#' \code{\link[base]{sprintf}} and are not a valid regex on their own. The
#' \code{\link[qdapRegex]{S}} is useful for adding these missing \code{\%s}
#' parameters.
#' @usage data(regex_supplement)
#' @details Use \code{qdapRegex:::examine_regex(regex_supplement)} to
#' interactively explore the regular expressions in \code{regex_supplement}.
#' This will provide a browser + console based break down of each regex in the
#' dictionary.
#' @format A list with 24 elements
#' @examples
#' time <- rm_(pattern="@@time_12_hours")
#' time("I will go at 12:35 pm")
#'
#' x <- "v6.0.156 for Windows 2000/2003/XP/Vista
#' Server version 1.1.20
#' Client Manager version 1.1.24"
#'
#' rm_default(x, pattern = "@@version", extract=TRUE)
#' rm_default(x, pattern = "@@version2", extract=TRUE)
#'
#' x <- "this is 1000000 big 4356. And little 123 number."
#' rm_default(x, pattern="@@thousands_separator", replacement="\\1,")
#' rm_default(x, pattern="@@thousands_separator", replacement="\\1.")
#'
#' rm_default("I was,but it costs 10,000.", pattern="@@white_after_comma",
#' replacement=", ")
#'
#' x <- "I like; the donuts; a lot"
#' strsplit(x, ";")
#' strsplit(x, S(grab("split_keep_delim"), ";"), perl=TRUE)
#' stringi::stri_split_regex(x, S(grab("split_keep_delim"), ";"))
#' stringi::stri_split_regex("I like; the donuts; a lot:cool",
#' S(grab("split_keep_delim"), ";|:"))
#'
#' ## Grab words around a point
#' x <- c(
#' "the magic word is e",
#' "the dog is red and they are blue",
#' "I am new but she is not new",
#' "hello world",
#' "why is it so cold? Perhaps it is Winter.",
#' "It is not true the 7 is 8.",
#' "Is that my drink?"
#' )
#'
#' rm_default(x, pattern = S("@@around_", 1, "is", 1), extract=TRUE)
#' rm_default(x, pattern = S("@@around_", 2, "is", 2), extract=TRUE)
#' rm_default(x, pattern = S("@@around_", 1, "is|are|am", 1), extract=TRUE)
#' rm_default(x, pattern = S("@@around_", 1, "is not|is|are|am", 1), extract=TRUE)
#' rm_default(x, pattern = S("@@around_", 1,
#' "is not|[Ii]s|[Aa]re|[Aa]m", 1), extract=TRUE)
#'
#' x <- c(
#' "hello world",
#' "45",
#' "45 & 5 makes 50",
#' "x and y",
#' "abc and def",
#' "her him foo & bar for Jack and Jill then"
#' )
#'
#' around_and <- rm_(pattern = S("@@around_", 1, "and|\\&", 1), extract=TRUE)
#' around_and(x)
#'
#' ## Split runs into chunks
#' x <- "1111100000222000333300011110000111000"
#' strsplit(x, grab("@@run_split"), perl = TRUE)
#'
#' \dontrun{
#' library(qdap);library(ggplot2);library(reshape2)
#'
#' out <- setNames(lapply(c("@@after_a", "@@after_the"), function(x) {
#' o <- rm_default(stringi::stri_trans_tolower(pres_debates2012$dialogue),
#' pattern = x, extract=TRUE)
#' m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
#' m[m$freq> 7, ]
#' }), c("a", "the"))
#'
#'
#' dat <- setNames(Reduce(function(x, y) {
#' merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE"))
#'
#' dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq")
#'
#' dat <- dat[order(dat$freq, dat$Word), ]
#'
#' ord <- aggregate(freq ~ Word, dat, sum)
#'
#' dat$Word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1])
#' ggplot(dat, aes(x=freq, y=Word)) + geom_point()+ facet_grid(~Article)
#' }
#'
#' ## remove/extract page numbers
#' x <- c("I read p. 36 and then pp. 45-49", "it's on pp. 23-24;28")
#'
#' rm_pages <- rm_(pattern="@@pages", extract=TRUE)
#' rm_pages(x)
#'
#' rm_default(x, pattern = "@@pages")
#' rm_default(x, pattern = "@@pages", extract=TRUE)
#' rm_default(x, pattern = "@@pages2", extract=TRUE)
#'
#' ## Validate pages
#' page_val <- validate("@@pages2", FALSE)
#' page_val(c(66, "78-82", "hello world", TRUE, "44-45; 56"))
#'
#' ## Split on last occurrence
#' x <- c(
#' "test@@aol@@fg.mm.com",
#' "test@@hotmail.com",
#' "test@@xyz@@rr@@lk.edu",
#' "test@@abc.xx@@zz.vv.net"
#' )
#'
#' strsplit(x, S("@@last_occurrence", "\\."), perl=TRUE)
#' strsplit(x, S("@@last_occurrence", "@@"), perl=TRUE)
#'
#' ## True Word Boundaries
#' x <- "this is _not a word666 and this is not a word too."
#' ## Standard regex word boundary
#' rm_default(x, pattern=bind("not a word"))
#' ## Alphabetic only word boundaries
#' rm_default(x, pattern=S("@@word_boundary", "not a word"))
#'
#' ## Remove all but first occurrence of something
#' x <- c(
#' "12-3=4-5=678-9",
#' "ABC-D=EF2-GHI-JK3=L-MN=",
#' "9-87=65",
#' "a - de=4fgh --= i5jkl",
#' NA
#' )
#'
#' rm_default(x, pattern = S("@@except_first", "-"))
#' rm_default(x, pattern = S("@@except_first", "="))
NULL
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.