Nothing
#' Regular expression to match numbers in English
#'
#' A regex pattern to identify natural language English number phrases, such as
#' "one hundred and fifty" or "thirty-seven". Used internally by
#' \code{\link{replace_numbers}} to identify substrings to replace with their
#' decimal representation.
#'
#' This is a PCRE (Perl type) regular expression, so it must be used with
#' \code{perl = TRUE} in base R regex functions. The packages \code{stringr}
#' and \code{stringi} are based on the alternative ICU regular expression
#' engine, so they cannot use this pattern.
#'
#' @note
#' There is limited support for fractional expressions like "one half".
#' The original pattern did not support expressions like "a thousand", but
#' it has been adapted to offer (experimental) support for this.
#' Phrases like "million" or "thousand" with no prefix will \emph{not} match.
#'
#' @source \url{https://www.rexegg.com/regex-trick-numbers-in-english.html}
regex_numbers <- "(?x) # free-spacing mode
(?(DEFINE)
# Within this DEFINE block, we'll define many subroutines
# They build on each other like lego until we can define
# a 'big number'
(?<one_to_9>
# The basic regex:
# one|two|three|four|five|six|seven|eight|nine
# We'll use an optimized version:
# Option 1: four|eight|(?:fiv|(?:ni|o)n)e|t(?:wo|hree)|
# s(?:ix|even)
# Option 2:
(?:f(?:ive|our)|s(?:even|ix)|t(?:hree|wo)|(?:ni|o)ne|eight)
) # end one_to_9 definition
(?<ten_to_19>
# The basic regex:
# ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|
# eighteen|nineteen
# We'll use an optimized version:
# Option 1: twelve|(?:(?:elev|t)e|(?:fif|eigh|nine|(?:thi|fou)r|
# s(?:ix|even))tee)n
# Option 2:
(?:(?:(?:s(?:even|ix)|f(?:our|if)|nine)te|e(?:ighte|lev))en|
t(?:(?:hirte)?en|welve))
) # end ten_to_19 definition
(?<two_digit_prefix>
# The basic regex:
# twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety
# We'll use an optimized version:
# Option 1: (?:fif|six|eigh|nine|(?:tw|sev)en|(?:thi|fo)r)ty
# Option 2:
(?:s(?:even|ix)|t(?:hir|wen)|f(?:if|or)|eigh|nine)ty
) # end two_digit_prefix definition
(?<one_to_99>
(?&two_digit_prefix)(?:[- ](?&one_to_9))?|(?&ten_to_19)|
(?&one_to_9)
) # end one_to_99 definition
(?<one_to_999>
(?:(?&one_to_9)|a)[ ]hundred(?:[ ](?:and[ ])?(?&one_to_99))?|
(?&one_to_99)
) # end one_to_999 definition
(?<one_to_999_999>
(?:(?&one_to_999)|a)[ ]thousand(?:[ ](?&one_to_999))?|
(?&one_to_999)
) # end one_to_999_999 definition
(?<one_to_999_999_999>
(?:(?&one_to_999)|a)[ ]million(?:[ ](?&one_to_999_999))?|
(?&one_to_999_999)
) # end one_to_999_999_999 definition
(?<one_to_999_999_999_999>
(?:(?&one_to_999)|a)[ ]billion(?:[ ](?&one_to_999_999_999))?|
(?&one_to_999_999_999)
) # end one_to_999_999_999_999 definition
(?<one_to_999_999_999_999_999>
(?:(?&one_to_999)|a)[ ]trillion(?:[ ](?&one_to_999_999_999_999))?|
(?&one_to_999_999_999_999)
) # end one_to_999_999_999_999_999 definition
(?<bignumber>
zero|(?&one_to_999_999_999_999_999)
) # end bignumber definition
(?<zero_to_9>
(?&one_to_9)|zero
) # end zero to 9 definition
(?<decimals>
point(?:[ ](?&zero_to_9))+
) # end decimals definition
(?<fractions>
(?:a[ ]|(?&one_to_9)[ ])?(?:\\b(?:hal(?:f|ves)|thirds?|quarters?|fifths?))
) # end fractions definition
(?<mixed_fractions>
(?:(?:(?&bignumber)|\\d+)(?:[ ])(?:and|[&])[ ])?(?&fractions)
) # end mixed fraction definition
) # End DEFINE
####### The Regex Matching Starts Here ########
(?&mixed_fractions)|(?&bignumber)(?:[ ](?&decimals))?
### Other examples of groups we could match ###
#(?&bignumber)
# (?&one_to_99)
# (?&one_to_999)
# (?&one_to_999_999)
# (?&one_to_999_999_999)
# (?&one_to_999_999_999_999)
# (?&one_to_999_999_999_999_999)"
#' Replace English number phrases with their decimal representations
#'
#' Uses \code{\link{numb_replacements}} to match parts of a string corresponding
#' to numbers, then invokes \code{\link{words2number}} to convert these
#' substrings to numeric. The rest of the string (the non-number words) is
#' left intact.
#'
#' Works on non-negative integer numbers under one billion
#' (one thousand million). Does not support fractions or decimals (yet).
#'
#' @param string A character vector. Can contain numbers and other text
#'
#' @return A character vector the same length as \code{string}, with words
#' replaced by their decimal representations.
#'
#' @examples
#' replace_numbers('Two plus two equals four')
#' replace_numbers('one hundred thousand dollars!')
#' replace_numbers(c('A vector', 'containing numbers', 'like thirty seven'))
#'
#' @seealso
#' \code{\link{words2number}}, for use on cleaned text that does not contain
#' any non-number words
#'
#' @export
replace_numbers <- function(string) {
#string <- str_remove_all(string, '(?:\\band|&)[ ]?')
matches <- gregexpr(regex_numbers, string, perl = TRUE, ignore.case = TRUE)
regmatches(string, matches) <- lapply(regmatches(string, matches), words2number)
string
}
#' Dictionary of English names of numbers
#'
#' For internal use in \code{\link{words2number}}. When passed as a replacement
#' to a function like
#' \code{\link[stringr:str_replace]{str_replace_all}}, it turns the
#' string into an arithmetic expression that can be evaluated to give an integer
#' representation of the named number.
#'
#' Lifted from Ben Marwick's \code{words2number} package and converted into
#' a named vector (previously a chain of \code{\link{gsub}} calls).
#'
#' @examples
#' \dontrun{
#' stringr::str_replace_all('one hundred and forty-two', numb_replacements)
#' }
#'
#' @note
#' Does not yet fully support decimals, fractions or mixed fractions.
#' Some limited support for 'half' expressions, e.g. 'one and a half'.
#'
#' @source \url{https://github.com/benmarwick/words2number}
numb_replacements <-
c('-' = ' ',
'eleven(?:th)?' = '+11',
'twel(?:ve|fth)' = '+12',
'thirteen(?:th)?' = '+13',
'fourteen(?:th)?' = '+14',
'fifteen(?:th)?' = '+15',
'sixteen(?:th)?' = '+16',
'seventeen(?:th)?' = '+17',
'eighteen(?:th)?' = '+18',
'nineteen(?:th)?' = '+19',
'twent(?:y|ieth)' = '+20',
'thirt(?:y|ieth)' = '+30',
'fort(?:y|ieth)' = '+40',
'fift(?:y|ieth)' = '+50',
'sixt(?:y|ieth)' = '+60',
'sevent(?:y|ieth)' = '+70',
'eight(?:y|ieth)' = '+80',
'ninet(?:y|ieth)' = '+90',
'(?:a|one) hundred(?:th)?' = '+100',
'two hundred(?:th)?' = '+200',
'three hundred(?:th)?' = '+300',
'four hundred(?:th)?' = '+400',
'five hundred(?:th)?' = '+500',
'six hundred(?:th)?' = '+600',
'seven hundred(?:th)?' = '+700',
'eight hundred(?:th)?' = '+800',
'nine hundred(?:th)?' = '+900',
'(?:\\b(?:a|one) )?half' = '+0.5',
'one|first|\\ba\\b' = '+1',
'second|two' = '+2',
'th(?:ree|ird)' = '+3',
'four(?:th)?' = '+4',
'fi(?:ve|fth)' = '+5',
'six(?:th)?' = '+6',
'seven(?:th)?' = '+7',
'eighth?' = '+8',
'nin(?:e|th)' = '+9',
'millions?' = ')*(1000000)+(0',
'thousand(?:th)?' = ')*(1000)+(0',
'hundred(?:th)?' = '+100',
'ten(?:th)?' = '+10',
'and|&' = '',
' ' = '',
'^' = '(0',
'$' = ')',
'\\(0\\(' = '',
'\\+\\+' = '\\+\\(',
'\\)\\+\\)' = '\\)',
# Finally remove any residual non-number words.
# Otherwise the generated arithmetic expression will not evaluate.
'[[:alpha:]]+' = ''
)
#' Convert English names of numbers to their numerical values
#'
#' @source Originally adapted from the
#' \href{https://github.com/benmarwick/words2number}{\code{words2number}} package by
#' Ben Marwick.
#'
#' @param txt A character vector containing names of numbers (only).
#'
#' @return A named numeric vector of the same length as \code{phrase}.
#'
#' @examples
#' words2number('seven')
#' words2number('forty-two')
#' words2number(c('three', 'one', 'twenty two thousand'))
#'
#' @importFrom stringr str_replace_all
#'
#' @export
words2number <- function(txt) {
if (length(txt) < 1)
return(txt)
if (any(lengths(txt) > 1))
stop('words2number does not work on nested lists')
if (!is.character(txt[[1]]))
stop('words2number should only be passed character-vector inputs')
expression <- stringr::str_replace_all(tolower(txt), numb_replacements)
result <- vapply(expression, function(e) eval(parse(text = e)), FUN.VALUE = 0)
setNames(result, txt)
}
#' List of Latin medical and pharmaceutical abbreviations
#'
#' A named character vector. Names represent Latin terms and values the English
#' translations. Used for converting terms like "q4h" into "every 4 hours",
#' which can then be parsed into a dosage frequency/interval.
#'
#' Use with a function like \code{\link[stringr:str_replace]{str_replace_all}}
#' to translate a prescription from Latin to English (thence to numbers).
#'
#' @source
#' \url{https://en.wikipedia.org/wiki/List_of_abbreviations_used_in_medical_prescriptions}
#'
#' @examples
#' stringr::str_replace_all('Take two tablets q4h', latin_medical_terms)
#'
#' @export
latin_medical_terms <- c(
`dieb alt` = 'every 2 days',
`alt hs` = 'every 2 hours', ## ?
`noc(?:te?)?` = 'at night',
`mane` = 'in the morning',
q8h = 'every 8 hours',
q7h = 'every 7 hours',
q6h = 'every 6 hours',
q5h = 'every 5 hours',
q4h = 'every 4 hours',
q3h = 'every 3 hours',
q2h = 'every 2 hours',
q1h = 'every 1 hour',
qhs = 'at bedtime',
qqh = 'every 4 hours',
qh = 'every 1 hour',
hs = 'bedtime',
bt = 'bedtime',
qam = 'every morning',
qpm = 'every afternoon',
`qds?` = 'daily',
q1d = 'daily',
qid = '4 / day',
qwk = 'every week',
`bds?` = 'twice daily',
bid = 'twice daily',
bis = 'twice',
biw = 'twice weekly',
`tds?` = 'thrice daily',
tiw = 'thrice weekly',
tid = 'thrice daily',
`alt h(?:or)?` = 'every 2 hours',
`alt d(?:\\b|ieb)` = 'every 2 days',
eod = 'every 2 days',
qad = 'every 2 days',
qod = 'every 2 days',
opd = '1 / day',
sid = '1 / day',
`\\bam\\b` = 'morning',
`\\bpm\\b` = 'afternoon',
`\\bom\\b` = 'every morning',
`\\bon\\b` = 'every night',
`\\bod\\b` = 'every day',
`sos|(?:si opus )?sit|siop` = 'as needed',
qs = 'as needed',
`prn|pro re nata` = 'as needed',
mdu = 'as directed',
`asd(?:ir)?` = 'as directed'
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.