#' words_to_numbers
#'
#'
#' words_to_numbers takes a character string and replaces numbers
#' written in words with numerics.
#'
#'
#' @param string a character string
#'
#' @return Returns a string with any detected numbers written as words replaced by numbers
#'
#' @examples
#' library(wordstonumbers)
#' words_to_numbers("ninety-nine red balloons")
#'
#' words_to_numbers("twelve")
#'
#' words_to_numbers("Forty-three thousand")
#'
#' words_to_numbers("There were one thousand, two hundred and thirty. Twelve.")
#'
#' example <-
#' "The PRQ is a twelve-item, four-point Likert scale
#' (from one = Never to four = Very Often) with three sub-scores
#' bullying (PRQ-Bully), being victimized (PRQgroup-Victim),
#' and pro-social behavior (PRQ-Prosocial). A translated,
#' backtranslated final Arabic version of the scale was found
#' to be accurate showing good internal consistency in this
#' sample [PRQ-Victim (alpha = .seventy four)
#' and PRQ-Bullies (alpha = seventy-four)]."
#'
#' words_to_numbers(example)
#'
#' @importFrom rlang .data
#'
#' @export
words_to_numbers <- function(string) {
if(length(unlist(string)) > 1 | class(string) != "character") {
warning("The argument which was passed to words_to_numbers is not a length 1 character element, only the first element has been used here. Consider using the apply or purrr::map functions to assess multiple elements at once.")
string <- unlist(string)[[1]]
}
### Setting up constants
UNITS <- list(
zero = 0,
one = 1,
two = 2,
thirteen = 13,
three = 3,
fourteen = 14,
four = 4,
fifteen = 15,
five = 5,
sixteen = 16,
six = 6,
seventeen = 17,
seven = 7,
eighteen = 18,
eight = 8,
nineteen = 19,
nine = 9,
ten = 10,
eleven = 11,
twelve = 12
)
TENS <- list(
ten = 10,
twenty = 20,
thirty = 30,
forty = 40,
fifty = 50,
sixty = 60,
seventy = 70,
eighty = 80,
ninety = 90
)
MAGNITUDES <- list(
hundred = 100,
thousand = 1000,
million = 1000000,
billion = 1000000000,
trillion = 1000000000000,
quadrillion = 1000000000000000,
quintillion = 1000000000000000000,
sextillion = 1000000000000000000000,
septillion = 1000000000000000000000000,
octillion = 1000000000000000000000000000,
nonillion = 1000000000000000000000000000000,
decillion = 1000000000000000000000000000000000
)
NUMBER <- c(UNITS, TENS, MAGNITUDES)
UNIT_NAMES <- names(UNITS)
TEN_NAMES <- names(TENS)
MAGNITUDE_NAMES <- names(MAGNITUDES)
NUMBER_NAMES <- c(UNIT_NAMES, TEN_NAMES, MAGNITUDE_NAMES)
# Splitting in to tokens at punctuation
stringSplitVec <-
stringr::str_split(string,
"((?<=[[!\"\\#$%&'()*+,\\-/:;<=>?@\\[\\\\\\]^_`{|}~]|\\s]))|((?=[[!\"\\#$%&'()*+,\\-/:;<=>?@\\[\\\\\\]^_`{|}~]|\\s]))|((?<![\\d])(?<=\\.)(?![\\d]))|((?<![\\d])(?=\\.)(?![\\d]))",
simplify = T)
# create binaries for whitespace or punctuation
punctuationBinary <-
stringr::str_detect(stringSplitVec, "[[:punct:]]|\\s|(^and$)")
# Ensuring that "." breaks numbers apart by counting it as non-punctuation
punctuationBinary[stringr::str_detect(stringSplitVec, "\\.")] <-
FALSE
### Detecting numbers
# Detecting numerics
numericBinary <-
!is.na(suppressWarnings(as.numeric(stringSplitVec)))
# Detecting numbers - i.e., any words that match any of the NUMBER_WORDS
numberBinary <-
stringr::str_detect(stringSplitVec, stringr::regex(
paste("^", NUMBER_NAMES, "$", collapse = "|", sep = ""),
ignore_case = T
))
# Converting all characters in the number areas into ascii characters
stringSplitVec[numberBinary] <- stringi::stri_trans_general( stringSplitVec[numberBinary], "latin-ascii")
# Quick exit if there are no numbers words detected
if (sum(numberBinary[!numericBinary]) < 1) {
return(string)
}
# And quick exit if there is only one number word pressent
if (length(stringSplitVec) == 1) {
return(as.character(format(NUMBER[[tolower(stringSplitVec)]], scientific = F)))
}
# Making tibble
stringSplit <-
tibble::tibble(
id = 1:length(numberBinary),
stringSplit = as.character(stringSplitVec),
punctuationBinary,
# Note that this counts numbers as words or numbers as digits as numbers
numberBinary = numericBinary | numberBinary,
numericBinary
)
stringSplit$group <- NA
# Using cumulative sum to count the number of non-number items, not counting punctuation
stringSplit$group[!stringSplit$punctuationBinary] <-
cumsum(!stringSplit$numberBinary[!stringSplit$punctuationBinary])
# Removing dots and other words from contention to ensure that groups get broken at points
stringSplit$group[stringr::str_detect(stringSplit$stringSplit, "^\\.$")] <-
-1
stringSplit$group <-
ifelse((stringSplit$numberBinary |
stringSplit$punctuationBinary),
stringSplit$group,
-1
)
# Filling in all NAs between two other identical values -
# this means that each set of number words is grouped together
stringSplit$group <-
ifelse(
tidyr::fill(stringSplit, "group", .direction = "down")$group == tidyr::fill(stringSplit, "group", .direction = "up")$group,
tidyr::fill(stringSplit, "group", .direction = "down")$group,
NA
)
# Checking for [number] [point] [number] and converting it into "number.number"
last_position <- nrow(stringSplit)
stringSplit$point <- stringr::str_detect(stringSplit$stringSplit,
stringr::regex("point|dot",
ignore_case = T))
stringSplit$tokenAheadPoint <- c(stringSplit$point[-1], F)
stringSplit$tokenBehindPoint <-
c(F, stringSplit$point[-last_position])
stringSplit$space <- stringSplit$stringSplit == " "
stringSplit$tokenAheadSpace <- c(stringSplit$space[-1], F)
stringSplit$tokenBehindSpace <-
c(F, stringSplit$space[-last_position])
stringSplit$tokenAheadNumber <- c(stringSplit$numberBinary[-1], F)
stringSplit$tokenBehindNumber <-
c(F, stringSplit$numberBinary[-last_position])
stringSplit$stringSplit <-
ifelse(c(
# If the token is a space AND
stringSplit$space &
# ... the token following the space is a point and before is a number, and the point is followed by anything, and then a number
(
stringSplit$tokenAheadPoint &
stringSplit$tokenBehindNumber &
c(stringSplit$tokenAheadNumber[-(1:2)], F, F)
) |
# ... the token before the value is a space AND the token ahead is a point, which is in turn followed by a number (three ahead))))
(
stringSplit$tokenAheadNumber & stringSplit$tokenBehindPoint
)
)
,
"", stringSplit$stringSplit)
# Replace "dot" or "point" with points (as eg "ten point five")
stringSplit$stringSplit <-
ifelse(
c(T, T, stringSplit$numberBinary[-((last_position - 1):last_position)]) &
c(stringSplit$numberBinary[-c(1, 2)], F, F) &
stringSplit$point,
".",
stringSplit$stringSplit
)
# initiallising number vector
stringSplit$number <-
ifelse(stringSplit$numericBinary,
yes = suppressWarnings(as.numeric(stringSplit$stringSplit)),
no = NA)
# Filtering down to just those groups that have numerics
numericStrings <- dplyr::filter(stringSplit, .data$group > -1)
# Identifying the types of each number
numericStrings$magnitudeType <-
stringr::str_detect(numericStrings$stringSplit,
stringr::regex(paste0("^", MAGNITUDE_NAMES, "$", collapse = "|"),
ignore_case = T)) |
ifelse(
!is.na(numericStrings$number),
as.numeric(numericStrings$number) %in% MAGNITUDES,
F
)
numericStrings$tenType <-
stringr::str_detect(numericStrings$stringSplit,
stringr::regex(paste0("^", TEN_NAMES, "$", collapse = "|"),
ignore_case = T)) |
ifelse(!is.na(numericStrings$number),
(
as.numeric(numericStrings$number) == 10 |
(as.numeric(numericStrings$number) %in% TENS)
)
, F)
numericStrings$unitType <-
stringr::str_detect(numericStrings$stringSplit,
stringr::regex(paste0("^", UNIT_NAMES, "$", collapse = "|"),
ignore_case = T)) |
ifelse(
!is.na(numericStrings$number),
# Counting as a unit if the token is a digit between 0 - 99 but not a 10
as.numeric(numericStrings$number) %in% seq(0, 99)[-c(seq(11, 91, by = 10))],
F
)
# Helper function to check whether the token is a number or matches a number and returns either the numeric or the string
token_to_number <- function(tokens) {
tokens <- tolower(tokens)
unlist(purrr::map(tokens,
function(token) {
if (is.null(NUMBER[token][[1]])) {
return(as.numeric(token))
} else {
NUMBER[token]
}
}))
}
# Breaking apart groups according to the following rules
for (groups in unique(numericStrings$group)) {
# Extracting numbers only
numericsOnly <-
dplyr::filter(numericStrings,
numericStrings$numberBinary,
numericStrings$group ==
groups)
if (nrow(numericsOnly) < 2) {
# If there is only one element it doesn't need to change
numericsOnly$tochange <- FALSE
# Else if there are multiple, we need to check that there are not any illegal characters
} else if (nrow(numericsOnly) > 1) {
#L <- 1:nrow(numericsOnly)
pairs_to_test <-
tibble::tibble(e1 = 1:(nrow(numericsOnly) - 1),
e2 = 2:nrow(numericsOnly))
# Figuring out break points, i.e., splitting where there are (note that
# this takes e2 as the position to split at, i.e., e1 is the preceeding
# value and e2 is the one which is broken at)
numericsOnly$tochange <- c(
FALSE,
# Breaking to new number if a unit type (1-19) was preceeded by a unit type (1-19)
(numericsOnly$unitType[pairs_to_test$e1] &
numericsOnly$unitType[pairs_to_test$e2]) |
# Also breaking if a ten type (10,20,30, ..., 90) of was preceeded by a unit type
(numericsOnly$unitType[pairs_to_test$e1] &
numericsOnly$tenType[pairs_to_test$e2]) |
# And breatking if a a ten type is preceeded by a ten type
(numericsOnly$tenType[pairs_to_test$e1] &
numericsOnly$tenType[pairs_to_test$e2]) |
# And breatking if a a ten type is preceeded by a unit type above 10
(numericsOnly$tenType[pairs_to_test$e1] &
numericsOnly$unitType[pairs_to_test$e2]) &
token_to_number(numericsOnly$stringSplit[pairs_to_test$e2]) > 10 |
# Adding break if a larger number in digits is followed by a word in
# numbers of a smaller magnitude (e.g., "1000 hundred" or "100 ten")
(
numericsOnly$numericBinary[pairs_to_test$e1] &
token_to_number(numericsOnly$stringSplit[pairs_to_test$e2]) <=
token_to_number(numericsOnly$stringSplit[pairs_to_test$e1])
) |
# Breaking if two digits are consecutive (e.g., "20, 1" vs "21")
(
numericsOnly$numericBinary[pairs_to_test$e1] &
numericsOnly$numericBinary[pairs_to_test$e2]
) |
# And breaking if a number is preceded by itself
(tolower(numericsOnly$stringSplit[pairs_to_test$e1]) ==
(
tolower(numericsOnly$stringSplit[pairs_to_test$e2])
)) |
# Breaking if a magnitude is preceded by a magnitude of a larger magnitude
token_to_number(numericsOnly$stringSplit[pairs_to_test$e1]) > 100 &
token_to_number(numericsOnly$stringSplit[pairs_to_test$e1]) >
token_to_number(numericsOnly$stringSplit[pairs_to_test$e2]) &
numericsOnly$magnitudeType[pairs_to_test$e1] &
numericsOnly$magnitudeType[pairs_to_test$e2]
)
}
# This part checks triplets
if (nrow(numericsOnly) > 2) {
triplets_to_test <-
tibble::tibble(
e1 = 1:(nrow(numericsOnly) - 2),
e2 = 2:(nrow(numericsOnly) - 1),
e3 = 3:nrow(numericsOnly)
)
numericsOnly$tochange <- c(
# Note that unlike the doubles, this "centers" on the middle value
# (i.e., e2 is still the value at which the break happens, not the last value)
F,
# This breaks if:
# a mangnitude is followed by a magnitude, and the latter magnitude is larger than the first
# (e.g., "twenty thousand, one million" as compared to "one million, twenty thousand")
# unless the lower number is a hundred in which case we let it slide
# (because, for example "one hundred twenty thousand" can be parsed as a single number)
# This carves out some exceptions for when xs of hundreds of thousands
# And the number before the hundred is below ten
c(
T,
!(
numericsOnly$unitType[triplets_to_test$e1 - 1] &
token_to_number(numericsOnly$stringSplit[triplets_to_test$e1 -
1]) < 10 &
token_to_number(numericsOnly$stringSplit[triplets_to_test$e1[-1]]) == 100 &
token_to_number(numericsOnly$stringSplit[triplets_to_test$e3[-1]]) > 100
)
) &
(
token_to_number(numericsOnly$stringSplit[triplets_to_test$e1]) <
token_to_number(numericsOnly$stringSplit[triplets_to_test$e3]) &
numericsOnly$magnitudeType[triplets_to_test$e1] &
numericsOnly$magnitudeType[triplets_to_test$e3]
) |
# Also breaks tokens apart if the token before is equal to the token afterwards, AND
# The token in the middle is less than either the one before or after
# And the end tokens are both magnitudes e.g., "1.6 million three million"
(token_to_number(numericsOnly$stringSplit[triplets_to_test$e1]) ==
token_to_number(numericsOnly$stringSplit[triplets_to_test$e3]) &
(token_to_number(numericsOnly$stringSplit[triplets_to_test$e2]) <
token_to_number(numericsOnly$stringSplit[triplets_to_test$e1]) |
token_to_number(numericsOnly$stringSplit[triplets_to_test$e2]) <
token_to_number(numericsOnly$stringSplit[triplets_to_test$e3])) &
numericsOnly$magnitudeType[triplets_to_test$e1] &
numericsOnly$magnitudeType[triplets_to_test$e3])
,
F
) | numericsOnly$tochange
}
# coercion to character to prevent Can't convert <character> to <double> error
numericStrings$group <- as.character(numericStrings$group)
# Updating numeric strings with updated groups
numericsOnly$group <-
stringr::str_c("a", numericsOnly$group, cumsum(numericsOnly$tochange))
numericStrings[match(numericsOnly$id, numericStrings$id), ] <-
dplyr::select(numericsOnly,-tochange)
}
# Dropping unchanging tokens (i.e., those that are not required, like punctuation that should not be altered)
numericStrings <- dplyr::filter(numericStrings,!is.na(groups))
# Reassigning the now ungrouped non-numerics
numericStrings$group[!stringr::str_detect(numericStrings$group, "^a\\d")] <- NA
numericStrings$group <-
ifelse(
tidyr::fill(numericStrings, .data$group, .direction = "down")$group == tidyr::fill(numericStrings, .data$group, .direction = "up")$group,
tidyr::fill(numericStrings, .data$group, .direction = "down")$group,
NA
)
# Helper function for assessing each group of numbers
identifyNumbers <- function(processedNumerics) {
# Extracting numbers only
numericsOnly <- dplyr::filter(processedNumerics, numberBinary)
# Creating numbers columns
numericsOnly$number[is.na(numericsOnly$number)] <-
as.numeric(NUMBER[match(tolower(numericsOnly$stringSplit[is.na(numericsOnly$number)]), NUMBER_NAMES)])
# Copying for tracking of original numbers, which impact the way that things are summed up
numericsOnly$oldNumber <- numericsOnly$number
# For all magnitiude types, count all smaller magnitude types as multipliers of the magnitude value
if (sum(numericsOnly$magnitudeType) > 0) {
for (position in which(numericsOnly$magnitudeType)) {
startCountingFrom <-
ifelse(any(numericsOnly$oldNumber[1:(position - 1)][numericsOnly$magnitudeType[1:(position -
1)]] > numericsOnly$oldNumber[position]),
max(
which(numericsOnly$oldNumber[1:(position - 1)] > numericsOnly$oldNumber[position]) + 1
),
1)
if (position > 1) {
# if the position is not the first value, then take the sum of all previous values
previousSum <-
sum(numericsOnly$number[startCountingFrom:(position - 1)]) # begining or previous highest magnitude
# And then take the product of the previous values and the current value
value <- previousSum * numericsOnly$number[position]
# Setting the current value equal to that
numericsOnly$number[position] <- value
# And setting the now summed up values to be equal to 0
numericsOnly$number[startCountingFrom:(position - 1)] <- 0
}
}
}
# Finally, the number is equal to the sum of all of the constitute numbers
return(format(sum(numericsOnly$number), scientific = F))
}
numericedOutput <- stringSplit
# for each group of numbers
for (groups in unique(stats::na.omit(numericStrings$group))) {
ids <- dplyr::filter(numericStrings, .data$group == groups)$id
# Blanking out the non-used numbers and repacing strings with numbers
numericedOutput$number[numericedOutput$id %in% ids][1] <-
identifyNumbers( dplyr::filter(numericStrings, group == groups ) )
# Replacing strings with the appropraite values, after removing all old text
numericedOutput$stringSplit[numericedOutput$id %in% ids] <- ""
numericedOutput$stringSplit[numericedOutput$id %in% ids][1] <-
numericedOutput$number[numericedOutput$id %in% ids][1]
}
return(paste0(numericedOutput$stringSplit, collapse = ""))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.