## HAS_TESTS
## If 'x' consists of integer-like values
## where the unique, sorted
## values form a series 0, 5, 10, ..., A
## (A >= 50), assume that these represent
## age groups "0-4", "5-9", "10-14", ..., "A+";
## otherwise return NULL.
clean_age_5 <- function(x) {
if (length(x) < 11L)
return(NULL)
is_obs <- !is.na(x)
x_int <- suppressWarnings(as.integer(x))
if (anyNA(x_int[is_obs]))
return(NULL)
if (any(x_int[is_obs] != x[is_obs]))
return(NULL)
breaks <- sort(unique(x_int[is_obs]))
n <- length(breaks)
if (n < 11L)
return(NULL)
s <- seq.int(from = 0L, by = 5L, length.out = n)
if (!identical(breaks, s))
return(NULL)
labels <- paste(breaks, breaks + 4L, sep = "-")
labels[[n]] <- paste0(breaks[[n]], "+")
i <- match(x_int, breaks)
labels[i]
}
## HAS_TESTS
## If 'x' consists of possible age group labels
## following non-dem formats,
## apply a standard set of transformations
## that are likely to turn the elements of 'x'
## into valid dem formats
clean_age_guess <- function(x, language) {
if (language == "English") {
year <- "year|years|yr|yrs"
infant <- "^infants$|^infant$|^in 1st$|^less than 1$|^under 1$|^less than one$"
plus <- "and over|plus|and above|and older|or more"
num <- c("zero", "one", "two", "three", "four",
"five", "six", "seven", "eight", "nine")
}
else
stop(gettextf("cannot process language \"%s\"",
language),
call. = FALSE)
## test whether 'x' consists of lower limits
## of 5-year or abridged life table age groups
ans <- clean_age_5(x)
if (!is.null(ans))
return(ans)
ans <- clean_age_lifetab(x)
if (!is.null(ans))
return(ans)
## put everything into lower case
x <- tolower(x)
## trim leading zeros from any numbers
x <- gsub("(?<![0-9])0+(?=[0-9])", "", x, perl = TRUE)
## remove "year" labels
x <- sub(year, "", x)
## translate synonyms for age group "0"
x <- sub(infant, "0", x)
## translate synonyms for "+"
x <- sub(plus, "+", x)
## remove spaces
x <- gsub(" ", "", x)
## tranlsate synonyms for "-"
x <- sub("^([0-9]+)to([0-9]+)$", "\\1-\\2", x)
x <- sub("^([0-9]+)[[:punct:]]+([0-9]+)$", "\\1-\\2", x)
## translate numbers
for (i in seq_along(num))
x <- gsub(num[[i]], i - 1L, x)
## return result
x
}
## HAS_TESTS
## If 'x' consists of integer-like values
## where the unique, sorted
## values form a series 0, 1, 5, 10, ..., A
## (A >= 50), assume that these represent
## lifetable age groups "0", "1-4", "5-9",
## "10-14", ..., "A+";
## otherwise return NULL.
clean_age_lifetab <- function(x) {
if (length(x) < 12L)
return(NULL)
is_obs <- !is.na(x)
x_int <- suppressWarnings(as.integer(x))
if (anyNA(x_int[is_obs]))
return(NULL)
if (any(x_int[is_obs] != x_int[is_obs]))
return(NULL)
breaks <- sort(unique(x_int[is_obs]))
n <- length(breaks)
if (n < 12L)
return(NULL)
s <- c(0L,
1L,
seq.int(from = 5L, by = 5L, length.out = n - 2L))
if (!identical(breaks, s))
return(NULL)
labels <- c("0",
"1-4",
paste(breaks[-(1:2)], breaks[-(1:2)] + 4L, sep = "-"))
labels[[n]] <- paste0(breaks[[n]], "+")
i <- match(x_int, breaks)
labels[i]
}
## HAS_TESTS
## If 'x' consists of possible cohort labels
## following non-dem formats,
## apply a standard set of transformations
## that are likely to turn the elements of 'x'
## into valid dem formats
clean_cohort_period_guess <- function(x, language, open_first) {
if (language == "English") {
lessthan <- "less than|up to|before"
quarters <- list(Q1 = "first quarter|quarter ?1|qu ?1|q ?1",
Q2 = "second quarter|quarter ?2|qu ?2|q ?2",
Q3 = "third quarter|quarter ?3|qu ?3|q ?3",
Q4 = "fourth quarter|quarter ?4|qu ?4|q ?4")
months <- list(Jan = "january|jan",
Feb = "february|feb",
Mar = "march|mar",
Apr = "april|apr",
May = "may|may",
Jun = "june|jun",
Jul = "july|jul",
Aug = "august|aug",
Sep = "september|sep",
Oct = "october|oct",
Nov = "november|nov",
Dec = "december|dec")
}
else
stop(gettextf("cannot process language \"%s\"",
language),
call. = FALSE)
## put everything into lower case
x <- tolower(x)
## trim leading zeros from any numbers
x <- gsub("(?<![0-9])0+(?=[0-9])", "", x, perl = TRUE)
## trim leading and trailing spaces
x <- gsub("^ +| +$", "", x)
## translate quarters
for (i in seq_along(quarters)) {
## replace synonym with official quarter name
synonyms <- quarters[[i]]
name <- names(quarters)[[i]]
## reformat where quarter and year in wrong order
pattern <- sprintf("(%s)[- ._]+([0-9]+)$", synonyms)
replacement <- sprintf("\\2 %s", name)
x <- sub(pattern, replacement, x)
## reformat where quarter and year in right order
pattern <- sprintf("([0-9]+)[- ._]*(%s)$", synonyms)
replacement <- sprintf("\\1 %s", name)
x <- sub(pattern, replacement, x)
}
## translate months
for (i in seq_along(months)) {
## replace synonym with official month name
synonyms <- months[[i]]
name <- names(months)[[i]]
## reformat where month and year in wrong order
pattern <- sprintf("(%s)[- ._]+([0-9]+)$", synonyms)
replacement <- sprintf("\\2 %s", name)
x <- sub(pattern, replacement, x)
## reformat where month and year in right order
pattern <- sprintf("([0-9]+)[- ._]*(%s)$", synonyms)
replacement <- sprintf("\\1 %s", name)
x <- sub(pattern, replacement, x)
}
## translate synonyms for "<"
if (open_first) {
p <- sprintf("^(%s) *", lessthan)
x <- sub(p, "<", x)
}
## tranlsate synonyms for "-"
x <- sub("^([0-9]+) *to *([0-9]+)$", "\\1-\\2", x)
x <- sub("^([0-9]+) *[[:punct:]]+ *([0-9]+)$", "\\1-\\2", x)
## return result
x
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.