#' @include imports.R
NULL
#' Unicode General Categories
#'
#' Match a Unicode General Category.
#' @param lo A non-negative integer. Minimum number of repeats, when grouped.
#' @param hi A positive integer. Maximum number of repeats, when grouped.
#' @param char_class \code{TRUE} or \code{FALSE}. Should the values be wrapped
#' into a character class?
#' @return A character vector representing part or all of a regular expression.
#' @references Table 12 of the Unicode Standard Annex #44 defines the Unicode
#' General Categories.
#' \url{http://www.unicode.org/reports/tr44/#Property_Values}
#'
#' You can see which characters are contained in a category by visiting, e.g.,
#' \url{http://www.fileformat.info/info/unicode/category/Nd/list.htm}
#' @seealso \code{\link{unicode_property}}, \code{\link{Unicode}}
#' @examples
#' # Classes
#' ugc_lowercase_letter()
#' ugc_decimal_number()
#' ugc_paragraph_separator()
#' ugc_currency_symbol()
#'
#' # With repetition
#' ugc_nonspacing_mark(3, 6)
#' ugc_separator(1, Inf)
#' ugc_dash_punctuation(0, Inf)
#'
#' # Without a class wrapper
#' ugc_titlecase_letter(char_class = FALSE)
#'
#' # Constants
#' UGC_UPPERCASE_LETTER
#' UGC_LETTER_NUMBER
#' UGC_MATH_SYMBOL
#' UGC_FORMAT_CONTROL
#'
#' \dontrun{
#' # All the Unicode general categories.
#' # Not run, since it generates lots of output
#' ls("package:rebus.unicode", pattern = "^ugc")
#' }
#'
#' # Usage
#' library(rebus.base)
#' x <- "I exchanged $1000 for \u20ac665.41 and \u00a3243.13."
#' (rx <- capture(ugc_currency_symbol()) %R%
#' capture(
#' ugc_decimal_number(1, Inf) %R%
#' optional(group("." %R% ugc_decimal_number(2)))
#' )
#' )
#' stringi::stri_match_all_regex(x, rx)
#' @name UnicodeGeneralCategory
#' @aliases unicode_general_category
NULL
# Build a regex fragment that matches a Unicode General Category.
#
# x: the General Category code to embed; callers in this file pass short
#    strings such as "Lu" or "Nd".
#
# Returns the value of rebus.base::regex() applied to the pieces "\p{", x
# and "}", i.e. the code wrapped in \p{...} property syntax.
unicode_general_category <- function(x) {
  opening <- "\\p{"
  closing <- "}"
  rebus.base::regex(opening, x, closing)
}
#' Unicode Properties
#'
#' Match a Unicode Property.
#' @param lo A non-negative integer. Minimum number of repeats, when grouped.
#' @param hi A positive integer. Maximum number of repeats, when grouped.
#' @param char_class \code{TRUE} or \code{FALSE}. Should the values be wrapped
#' into a character class?
#' @return A character vector representing part or all of a regular expression.
#' @references Table 9 of the Unicode Standard Annex #44 defines the Unicode
#' Properties.
#' \url{http://www.unicode.org/reports/tr44/#PropList.txt}
#'
#' There is some more information on the motivation for properties and examples
#' of properties of a character in Annex #23.
#' \url{http://www.unicode.org/reports/tr23}
#' @seealso \code{\link{unicode_general_category}}, \code{\link{Unicode}},
#' \code{\link[stringi]{stringi-search-charclass}}
#' @examples
#' # Classes
#' up_math()
#' up_posix_alnum()
#' up_changes_when_uppercased()
#' up_diacritic()
#'
#' # With repetition
#' up_diacritic(3, 6)
#' up_quotation_mark(1, Inf)
#' up_posix_xdigit(0, Inf)
#'
#' # Without a class wrapper
#' up_hyphen(char_class = FALSE)
#'
#' # Constants
#' UP_ALPHABETIC
#' UP_DASH
#' UP_POSIX_ALNUM
#' UP_CHANGES_WHEN_LOWERCASED
#'
#' \dontrun{
#' # All the Unicode properties.
#' # Not run, since it generates lots of output
#' ls("package:rebus.unicode", pattern = "^up")
#' }
#'
#' # Usage
#' # Hello in Samoan, Serbian, Persian, Simplified Chinese
#' hello <- "t\u101lofa, \u437\u434\u440\u430\u432\u43e, \u633\u644\u627\u645, \u4f60\u597d"
#' stringi::stri_extract_all_regex(hello, up_alphabetic(1, Inf))
#' stringi::stri_extract_all_regex(hello, up_case_sensitive(1, Inf))
#' @name UnicodeProperty
#' @aliases unicode_property
NULL
# Build a regex fragment that matches a Unicode property.
#
# x:      the property name to embed; callers in this file pass upper-case
#         names such as "DASH".
# syntax: "unicode" (the default) wraps x as \p{x}; "posix" wraps it as
#         [:x:]. The value is resolved with match.arg(), so anything that
#         does not match one of those two choices raises an error.
#
# Returns the value of rebus.base::regex() applied to the chosen pieces.
unicode_property <- function(x, syntax = c("unicode", "posix")) {
  style <- match.arg(syntax)
  switch(
    style,
    unicode = rebus.base::regex("\\p{", x, "}"),
    posix = rebus.base::regex("[:", x, ":]")
  )
}
# Unicode General Categories ----------------------------------------------
# Letter categories. Each constant is the result of
# unicode_general_category() for the given code, i.e. the code wrapped as
# \p{<code>}. Covers the individual letter codes (Lu, Ll, Lt, Lm, Lo), the
# cased-letter group (LC) and the whole-class code (L). Every constant is
# exported and documented on the shared UnicodeGeneralCategory help page.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_UPPERCASE_LETTER <- unicode_general_category("Lu")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_LOWERCASE_LETTER <- unicode_general_category("Ll")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_TITLECASE_LETTER <- unicode_general_category("Lt")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_CASED_LETTER <- unicode_general_category("LC")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_MODIFIER_LETTER <- unicode_general_category("Lm")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_OTHER_LETTER <- unicode_general_category("Lo")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_LETTER <- unicode_general_category("L")
# Mark categories: nonspacing (Mn), spacing (Mc), enclosing (Me) and the
# whole-class code (M). Each constant is unicode_general_category() applied
# to the code, i.e. the code wrapped as \p{<code>}.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_NONSPACING_MARK <- unicode_general_category("Mn")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_SPACING_MARK <- unicode_general_category("Mc")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_ENCLOSING_MARK <- unicode_general_category("Me")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_MARK <- unicode_general_category("M")
# Number categories: decimal (Nd), letter (Nl), other (No) and the
# whole-class code (N). Each constant is unicode_general_category() applied
# to the code, i.e. the code wrapped as \p{<code>}.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_DECIMAL_NUMBER <- unicode_general_category("Nd")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_LETTER_NUMBER <- unicode_general_category("Nl")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_OTHER_NUMBER <- unicode_general_category("No")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_NUMBER <- unicode_general_category("N")
# Punctuation categories: connector (Pc), dash (Pd), open (Ps), close (Pe),
# initial (Pi), final (Pf), other (Po) and the whole-class code (P). Each
# constant is unicode_general_category() applied to the code, i.e. the code
# wrapped as \p{<code>}.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_CONNECTOR_PUNCTUATION <- unicode_general_category("Pc")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_DASH_PUNCTUATION <- unicode_general_category("Pd")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_OPEN_PUNCTUATION <- unicode_general_category("Ps")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_CLOSE_PUNCTUATION <- unicode_general_category("Pe")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_INITIAL_PUNCTUATION <- unicode_general_category("Pi")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_FINAL_PUNCTUATION <- unicode_general_category("Pf")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_OTHER_PUNCTUATION <- unicode_general_category("Po")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_PUNCTUATION <- unicode_general_category("P")
# Symbol categories: math (Sm), currency (Sc), modifier (Sk), other (So) and
# the whole-class code (S). Each constant is unicode_general_category()
# applied to the code, i.e. the code wrapped as \p{<code>}.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_MATH_SYMBOL <- unicode_general_category("Sm")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_CURRENCY_SYMBOL <- unicode_general_category("Sc")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_MODIFIER_SYMBOL <- unicode_general_category("Sk")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_OTHER_SYMBOL <- unicode_general_category("So")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_SYMBOL <- unicode_general_category("S")
# Separator categories: space (Zs), line (Zl), paragraph (Zp) and the
# whole-class code (Z). Each constant is unicode_general_category() applied
# to the code, i.e. the code wrapped as \p{<code>}.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_SPACE_SEPARATOR <- unicode_general_category("Zs")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_LINE_SEPARATOR <- unicode_general_category("Zl")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_PARAGRAPH_SEPARATOR <- unicode_general_category("Zp")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_SEPARATOR <- unicode_general_category("Z")
# "Other" subcategories: control (Cc), format (Cf), surrogate (Cs),
# private use (Co) and unassigned (Cn). Each constant is
# unicode_general_category() applied to the code, i.e. the code wrapped as
# \p{<code>}.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_CONTROL <- unicode_general_category("Cc")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_FORMAT_CONTROL <- unicode_general_category("Cf")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_SURROGATE_CONTROL <- unicode_general_category("Cs")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_PRIVATE_USE_CONTROL <- unicode_general_category("Co")
#' @rdname UnicodeGeneralCategory
#' @export
UGC_UNASSIGNED_CONTROL <- unicode_general_category("Cn")
# "Other" is the major class that groups the Cc, Cf, Cs, Co and Cn
# subcategories. Its General Category code in UAX #44 (Table 12) is "C";
# there is no category "O", so \p{O} would not be a valid category pattern.
#' @rdname UnicodeGeneralCategory
#' @export
UGC_OTHER <- unicode_general_category("C")
# Unicode Properties ------------------------------------------------------
# Alphabetic is a Unicode property rather than a General Category, so it is
# built with unicode_property() like the other UP_* constants. With the
# default syntax this yields the same \p{ALPHABETIC} pattern text as before.
#' @rdname UnicodeProperty
#' @export
UP_ALPHABETIC <- unicode_property("ALPHABETIC")
# Unicode property constants. Each one is unicode_property(<NAME>) called
# without a syntax argument, so the default "unicode" form applies and the
# pattern text is \p{<NAME>}. Every constant is exported and documented on
# the shared UnicodeProperty help page.
#' @rdname UnicodeProperty
#' @export
UP_ASCII_HEX_DIGIT <- unicode_property("ASCII_HEX_DIGIT")
#' @rdname UnicodeProperty
#' @export
UP_BIDI_CONTROL <- unicode_property("BIDI_CONTROL")
#' @rdname UnicodeProperty
#' @export
UP_BIDI_MIRRORED <- unicode_property("BIDI_MIRRORED")
#' @rdname UnicodeProperty
#' @export
UP_DASH <- unicode_property("DASH")
#' @rdname UnicodeProperty
#' @export
UP_DEFAULT_IGNORABLE_CODE_POINT <- unicode_property("DEFAULT_IGNORABLE_CODE_POINT")
#' @rdname UnicodeProperty
#' @export
UP_DEPRECATED <- unicode_property("DEPRECATED")
#' @rdname UnicodeProperty
#' @export
UP_DIACRITIC <- unicode_property("DIACRITIC")
#' @rdname UnicodeProperty
#' @export
UP_EXTENDER <- unicode_property("EXTENDER")
#' @rdname UnicodeProperty
#' @export
UP_HEX_DIGIT <- unicode_property("HEX_DIGIT")
#' @rdname UnicodeProperty
#' @export
UP_HYPHEN <- unicode_property("HYPHEN")
#' @rdname UnicodeProperty
#' @export
UP_ID_CONTINUE <- unicode_property("ID_CONTINUE")
#' @rdname UnicodeProperty
#' @export
UP_ID_START <- unicode_property("ID_START")
#' @rdname UnicodeProperty
#' @export
UP_IDEOGRAPHIC <- unicode_property("IDEOGRAPHIC")
#' @rdname UnicodeProperty
#' @export
UP_LOWERCASE <- unicode_property("LOWERCASE")
#' @rdname UnicodeProperty
#' @export
UP_MATH <- unicode_property("MATH")
#' @rdname UnicodeProperty
#' @export
UP_NONCHARACTER_CODE_POINT <- unicode_property("NONCHARACTER_CODE_POINT")
#' @rdname UnicodeProperty
#' @export
UP_QUOTATION_MARK <- unicode_property("QUOTATION_MARK")
#' @rdname UnicodeProperty
#' @export
UP_SOFT_DOTTED <- unicode_property("SOFT_DOTTED")
#' @rdname UnicodeProperty
#' @export
UP_TERMINAL_PUNCTUATION <- unicode_property("TERMINAL_PUNCTUATION")
#' @rdname UnicodeProperty
#' @export
UP_UPPERCASE <- unicode_property("UPPERCASE")
#' @rdname UnicodeProperty
#' @export
UP_WHITE_SPACE <- unicode_property("WHITE_SPACE")
#' @rdname UnicodeProperty
#' @export
UP_CASE_SENSITIVE <- unicode_property("CASE_SENSITIVE")
# POSIX-named property constants. The constant names carry a POSIX_ prefix
# but the names passed in do not (e.g. "ALNUM"). They are built without a
# syntax argument, so they use the default \p{<NAME>} form rather than the
# [:<NAME>:] form that syntax = "posix" would produce.
#' @rdname UnicodeProperty
#' @export
UP_POSIX_ALNUM <- unicode_property("ALNUM")
#' @rdname UnicodeProperty
#' @export
UP_POSIX_BLANK <- unicode_property("BLANK")
#' @rdname UnicodeProperty
#' @export
UP_POSIX_GRAPH <- unicode_property("GRAPH")
#' @rdname UnicodeProperty
#' @export
UP_POSIX_PRINT <- unicode_property("PRINT")
#' @rdname UnicodeProperty
#' @export
UP_POSIX_XDIGIT <- unicode_property("XDIGIT")
# Case-related property constants (cased, case-ignorable and the
# CHANGES_WHEN_* family). Each one is unicode_property(<NAME>) with the
# default syntax, so the pattern text is \p{<NAME>}.
#' @rdname UnicodeProperty
#' @export
UP_CASED <- unicode_property("CASED")
#' @rdname UnicodeProperty
#' @export
UP_CASE_IGNORABLE <- unicode_property("CASE_IGNORABLE")
#' @rdname UnicodeProperty
#' @export
UP_CHANGES_WHEN_LOWERCASED <- unicode_property("CHANGES_WHEN_LOWERCASED")
#' @rdname UnicodeProperty
#' @export
UP_CHANGES_WHEN_UPPERCASED <- unicode_property("CHANGES_WHEN_UPPERCASED")
#' @rdname UnicodeProperty
#' @export
UP_CHANGES_WHEN_TITLECASED <- unicode_property("CHANGES_WHEN_TITLECASED")
#' @rdname UnicodeProperty
#' @export
UP_CHANGES_WHEN_CASEFOLDED <- unicode_property("CHANGES_WHEN_CASEFOLDED")
#' @rdname UnicodeProperty
#' @export
UP_CHANGES_WHEN_CASEMAPPED <- unicode_property("CHANGES_WHEN_CASEMAPPED")
#' @rdname UnicodeProperty
#' @export
UP_CHANGES_WHEN_NFKC_CASEFOLDED <- unicode_property("CHANGES_WHEN_NFKC_CASEFOLDED")
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.