#' @title Format People's Name
#'
#' @description Convert people's names to different formats (i.e. Last Name,
#' First Names(s) or First Name(s), Last Name) with or without last name
#' prepositions. The default name format is the one suggested in the
#' \href{https://www.tdwg.org/}{Biodiversity Information Standards} (TDWG)
#' format.
#'
#' @param x the character string or vector containing the names.
#' @param sep character. Input and output name separator. Default to ", ".
#' @param format character. Output name format. The default is "last_init".
#' @param pretty logical. Should the output name be returned in a pretty
#' presentation (i.e. only the first letter of names capitalized, initials
#' separated by points and no spaces, and family name prepositions in lower
#' cases). Default to TRUE. If FALSE, names are returned in the same way as
#' the input object \code{x}.
#' @param get.prep logical. Should last name prepositions be included? Default
#' to FALSE.
#' @param get.initials logical. Should the first name(s) be abbreviated?
#' Default to TRUE.
#' @param max.initials numerical. Upper limit of number of letter for a single
#' word to be considered as initials and not as a first name. Default to 4.
#'
#' @return The character string \code{x} in the standardized format.
#'
#' @details The default name format follows the one suggested by the
#' \href{https://www.tdwg.org/}{TDWG}, which is: Last name, followed by a
#' comma and then the initials, separated by points (e.g. Hatschbach, G.G.).
#'
#' The functions uses internally another __plantR__ function: `lastName()`.
#' So, it assumes that people last names are the ones provided at the end of
#' the name string or preceding the name separator (i.e. comma), if present.
#'
#' The function deals with simples last names, as well as with compound last
#' names and last names with common name prefixes or prepositions (e.g. de,
#' dos, van, ter, ...). By default, these prefixes and prepositions are
#' removed, but they can be returned if the argument `get.prep` is set to
#' TRUE.
#'
#' The function assumes that all names containing separators (default to a
#' comma) are in the format suggested by TDWG. But even for those cases, the
#' function fixes simple problems (e.g. missing points between name initials).
#'
#' If only one name is given, the function return the same name with the first
#' letter capitalized.
#'
#' The function output it is relatively stable regarding the input format,
#' lower/uppercasing and spacing. But if the name provided has unusual
#' formatting or if names for multiple people are provided within the same
#' string, the function may not work properly. So, the output may depend on
#' the input format and some level of double-checking may be necessary. See
#' examples below.
#'
#' @author Renato A. F. de Lima
#'
#' @references
#'
#' Conn, Barry J. (ed.) (1996). HISPID 3 - Herbarium Information Standards and
#' Protocols for Interchange of Data. Herbarium Information Systems Committee'
#' (HISCOM). https://www.tdwg.org/standards/hispid3/
#'
#' Willemse, L.P., van Welzen, P.C. & Mols, J.B. (2008).
#' Standardisation in data-entry across databases: Avoiding Babylonian
#' confusion. Taxon 57(2): 343-345.
#'
#' @seealso
#' \link[plantR]{lastName}, \link[plantR]{getPrep} and \link[plantR]{getInit}.
#'
#' @export prepTDWG
#'
#' @examples
#' # Single names
#' prepTDWG("gentry")
#' prepTDWG("GENTRY")
#'
#' # Simple names
#' prepTDWG("Alwyn Howard Gentry")
#' prepTDWG("Alwyn H. Gentry")
#' prepTDWG("A.H. Gentry")
#' prepTDWG("A H Gentry")
#' prepTDWG("Gentry, Alwyn Howard")
#' prepTDWG("Gentry, AH")
#' prepTDWG("Gentry AH")
#' prepTDWG("GENTRY, A H")
#' prepTDWG("gentry, alwyn howard")
#' prepTDWG("gentry, a.h.")
#' prepTDWG("gentry, a. h.")
#'
#' # Name with prepositions
#' prepTDWG("Carl F. P. von Martius")
#' prepTDWG("Carl F. P. von Martius", get.prep = TRUE)
#'
#' # Names with generational suffixes
#' prepTDWG("Hermogenes de Freitas Leitao Filho")
#' prepTDWG("H.F. Leitao Filho")
#' prepTDWG("Leitao Filho, HF")
#' prepTDWG("Leitao filho, H. F.")
#'
#' # Compound last name
#' prepTDWG("Augustin Saint-Hilaire")
#' prepTDWG("A. Saint-Hilaire")
#' prepTDWG("Saint-Hilaire, Augustin")
#'
#' # Other formats
#' prepTDWG("John MacDonald")
#' prepTDWG("John McDonald")
#' prepTDWG("John O'Brien")
#'
#' # Multiple names, different settings
#' names <- c("Gentry, AH", "Gentry A.H.",
#' "Carl F. P. von Martius","Leitao filho, H. de F.",
#' "Auguste de Saint-Hilaire", "John O'Reilly")
#' prepTDWG(names)
#' prepTDWG(names, format = "init_last")
#' prepTDWG(names, format = "init_last", get.prep = TRUE)
#' prepTDWG(names, get.prep = TRUE, format = "prep_last_init")
#' prepTDWG(names, get.prep = TRUE, format = "prep_last_init",
#' get.initials = FALSE)
#' prepTDWG(names, get.prep = TRUE, pretty = FALSE,
#' get.initials = FALSE)
#'
#' ## Unusual formatting (function won't work always...)
#' # two or more people names: output incorrect (combine names of authors)
#' prepTDWG("C. Mendonca Filho; F. da Silva")
#' # two or more names, separated by comma: output incorrect (combine names of authors)
#' prepTDWG("A. Alvarez, A. Zamora & V. Huaraca")
#' # one name, two commas: fails to get all names
#' prepTDWG("Cesar Sandro, Esteves, F")
#' #' one name, abbreviations in the start and end: fails to get all names
#' prepTDWG("C.S. Esteves F.")
#'
prepTDWG <- function(x,
sep = ", ",
format = "last_init",
pretty = TRUE,
get.prep = FALSE,
get.initials = TRUE,
max.initials = 4) {
# Detecting different name formats
patt <- paste0("\\p{L}", sep, "\\p{L}")
# patt <- paste0("\\p{Ll}", sep, "\\p{Lu}")
commas <- grepl(patt, x, perl = TRUE)
NAs <- x %in% c("", " ", NA)
words <- grepl(" ", x, fixed = TRUE)
# Detecting and fixing other name formats
oa <- grepl("O'[A-Z]|O' [A-Z]", x, perl = TRUE)
if (any(oa))
x[oa] <- gsub("(O') ([A-Z])", "\\1\\2", x[oa], perl = TRUE)
mac <- grepl('Mac[A-Z]|Mac [A-Z]', x, perl = TRUE)
if (any(mac))
x[mac] <- gsub("(Mac) ([A-Z])", "\\1\\2", x[mac], perl = TRUE)
mc <- grepl('Mc[A-Z]|Mc [A-Z]', x, perl = TRUE)
if (any(mc))
x[mc] <- gsub("(Mc) ([A-Z])", "\\1\\2", x[mc], perl = TRUE)
nomes <- x
if (any(commas & words & !NAs)) { # Names already with commas and >1 word
# Getting last/first names, as well as prefixes/prepositions and initials
split <- matrix(c(lastName(x[commas & words & !NAs],
first.capital = FALSE),
lastName(x[commas & words & !NAs],
invert = TRUE, initials = FALSE,
first.capital = FALSE)),
ncol = 2)
split <- getPrep(split, rm.prep = FALSE)
if (get.initials) {
# Initials containing (collection) codes in parentheses
remove_these <-
grepl(" \\([A-Z]", split[, 2]) & grepl("[A-Z]\\)", split[, 2])
if (any(remove_these))
split[, 2][remove_these] <-
gsub(" \\([A-Z].*", "", split[, 2][remove_these], perl = TRUE)
# Trying to isolate initials w/out periods that are equal to prepositions
check_caps <- split[, 1] == toupper(split[, 1]) &
split[, 2] == toupper(split[, 2]) &
!split[, 2] %in% ""
if (any(check_caps)) {
split[, 2][check_caps] <-
gsub("(*UCP)[^;\\&\\-\\\\'\\s](?<!\\b\\p{L})", "",
split[, 2][check_caps], perl = TRUE)
split[, 2][check_caps] <-
gsub("(\\p{L})", "\\1.", split[, 2][check_caps], perl = TRUE)
split[, 2][check_caps] <-
gsub("\\.,\\.|\\.\\.", ".", split[, 2][check_caps], perl = TRUE)
}
split[, 2][!check_caps] <-
getInit(split[, 2][!check_caps], max.initials = 10,
rm.spaces = pretty)
} else {
# split[, 2] <- stringr::str_squish(split[, 2])
split[, 2] <- gsub("\\s+", " ", split[, 2], perl = TRUE)
split[, 2] <- gsub("^ | $", "", split[, 2], perl = TRUE)
}
if (pretty) {
# first/middle names of initials
if (get.initials) {
split[, 2] <- gsub(" ", "", split[, 2], fixed = TRUE)
} else {
# Detecting the first possible conditions
not_empty <- !split[,2] %in% ""
init_caps <- split[,2] == toupper(split[,2])
is_inits <- nchar(gsub("(*UCP)[^;](?<!\\p{L})",
"", split[, 2], perl=TRUE)) < max.initials
# Adding periods to initials
check_inits <- init_caps & is_inits & not_empty
split[, 2][check_inits] <-
gsub("(\\p{Lu})(\\s)(\\p{Lu})+", "\\1.\\2\\3.",
split[, 2][check_inits], perl = TRUE)
split[, 2][check_inits] <-
gsub("(\\p{Lu})(\\p{Lu})+", "\\1.\\2.",
split[, 2][check_inits], perl = TRUE)
split[, 2][check_inits] <-
gsub("(\\p{Lu})(\\.\\s)(\\p{Lu}$)", "\\1\\2\\3.",
split[, 2][check_inits], perl = TRUE)
split[, 2][check_inits] <-
gsub("(^\\p{Lu}$)", "\\1.",
split[, 2][check_inits], perl = TRUE)
# Detecting the second possible conditions
w_abbrev <- grepl("\\.", split[, 2], perl = TRUE)
init_low <- split[,2] == tolower(split[,2])
#Checking all caps/lower without abbreviation
check_these <- !w_abbrev & (init_caps | init_low) & not_empty
if (any(check_these))
split[, 2][check_these] <- capName(split[, 2][check_these])
# Any name initials not separated by commas or spaces? (Muted for now)
# w_spaces <- grepl(" ", split[, 2], fixed = TRUE)
# not_all_caps <- !split[,1] == toupper(split[,1]) & init_caps
# check_caps <- !w_abbrev & !w_spaces & init_caps &
# not_all_caps & not_empty
# if (any(check_caps))
# split[, 2][check_caps] <- gsub("([A-Z])", "\\1.",
# split[, 2][check_caps], perl = TRUE)
#Removing spaces between initials
split[, 2] <- gsub("(\\.) (\\p{L}\\.)", "\\1\\2",
split[, 2], perl = TRUE)
split[, 2] <- gsub("(\\.) (\\p{L}\\.)", "\\1\\2",
split[, 2], perl = TRUE)
#Final edits (lowr case initials)
split[, 2][init_low] <- gsub("(\\p{Ll})(\\.)+",
"\\U\\1\\2",
split[, 2][init_low], perl = TRUE)
}
# family name
split[, 1] <- capName(split[, 1])
# split[, 1] <- gsub("(^\\p{Lu})(\\s\\p{Lu})", "\\1.\\2", # muted for now
# split[, 1], perl = TRUE)
# family name prepositions
split[, 3] <- tolower(split[, 3])
}
# Generating the name string
if (get.prep) {
if (format %in% c("last_init", "last_init_prep"))
names <- paste(paste(split[, 1], split[, 2], sep = sep), split[, 3])
if (format %in% c("prep_last_init"))
names <- paste(split[, 3], paste(split[, 1], split[, 2], sep = sep))
if (format %in% c("init_last"))
names <- paste(split[, 2], split[, 3], split[, 1])
} else {
if (format %in% c("last_init", "last_init_prep", "prep_last_init"))
names <- paste(split[, 1], split[, 2], sep = sep)
if (format %in% c("init_last"))
names <- paste(split[, 2], split[, 1])
}
# Final edits on the name string
# names <- stringr::str_squish(names)
names <- gsub("\\s+", " ", names, perl = TRUE)
names <- gsub("^ | $", "", names, perl = TRUE)
names <- gsub("\\.\\s\\.", ".", names, perl = TRUE)
names <- gsub(",$", "", names, perl = TRUE)
nomes[commas & words & !NAs] <- names
}
if (any(!commas & words & !NAs)) { # Names without commas and >1 word
# Names ending with (collection) codes in parentheses
remove_these <-
grepl(" \\([A-Z]", x[!commas & words & !NAs]) &
grepl("[A-Z]\\)$", x[!commas & words & !NAs])
if (any(remove_these))
x[!commas & words & !NAs][remove_these] <-
gsub(" \\([A-Z].*", "",
x[!commas & words & !NAs][remove_these], perl = TRUE)
# Getting last/first names, as well as prefixes/prepositions and initials
split <- matrix(c(lastName(x[!commas & words & !NAs],
first.capital = FALSE),
lastName(x[!commas & words & !NAs],
invert = TRUE, initials = FALSE,
first.capital = FALSE)),
ncol = 2)
split <- getPrep(split, rm.prep = FALSE)
if (get.initials) {
# Trying to isolate initials w/out periods that are equal to prepositions
check_caps <- split[, 1] == toupper(split[, 1]) &
split[, 2] == toupper(split[, 2]) &
!split[, 2] %in% ""
if (any(check_caps)) {
split[, 2][check_caps] <-
gsub("(*UCP)[^;\\&\\-\\\\'\\s](?<!\\b\\p{L})", "",
split[, 2][check_caps], perl = TRUE)
split[, 2][check_caps] <-
gsub("(\\p{L})", "\\1.", split[, 2][check_caps], perl = TRUE)
split[, 2][check_caps] <-
gsub("\\.,\\.|\\.\\.", ".", split[, 2][check_caps], perl = TRUE)
}
split[, 2][!check_caps] <-
getInit(split[, 2][!check_caps], max.initials = 10,
rm.spaces = pretty)
split[, 2][split[, 2] %in% "character(0)."] <- ""
} else {
# split[, 2] <- stringr::str_squish(split[, 2])
split[, 2] <- gsub("\\s+", " ", split[, 2], perl = TRUE)
split[, 2] <- gsub("^ | $", "", split[, 2], perl = TRUE)
}
if (pretty) {
# first/middle names of initials
if (get.initials) {
split[, 2] <- gsub(" ", "", split[, 2], fixed = TRUE)
} else {
# Detecting the first possible conditions
not_empty <- !split[,2] %in% ""
init_caps <- split[,2] == toupper(split[,2])
is_inits <- nchar(gsub("(*UCP)[^;](?<!\\p{L})",
"", split[, 2], perl=TRUE)) < max.initials
# Adding periods to initials
check_inits <- init_caps & is_inits & not_empty
split[, 2][check_inits] <-
gsub("(\\p{Lu})(\\s)(\\p{Lu})+", "\\1.\\2\\3.",
split[, 2][check_inits], perl = TRUE)
split[, 2][check_inits] <-
gsub("(\\p{Lu})(\\p{Lu})+", "\\1.\\2.",
split[, 2][check_inits], perl = TRUE)
split[, 2][check_inits] <-
gsub("(\\p{Lu})(\\.\\s)(\\p{Lu}$)", "\\1\\2\\3.",
split[, 2][check_inits], perl = TRUE)
split[, 2][check_inits] <-
gsub("(^\\p{Lu}$)", "\\1.",
split[, 2][check_inits], perl = TRUE)
# Detecting the second possible conditions
w_abbrev <- grepl("\\.", split[, 2], perl = TRUE)
init_low <- split[,2] == tolower(split[,2])
#Checking all caps/lower without abbreviation
check_these <- !w_abbrev & (init_caps | init_low) & not_empty
if (any(check_these))
split[, 2][check_these] <- capName(split[, 2][check_these])
# Any name initials not separated by commas or spaces? (Muted for now)
# w_spaces <- grepl(" ", split[, 2], fixed = TRUE)
# not_all_caps <- !split[,1] == toupper(split[,1]) & init_caps
# check_caps <- !w_abbrev & !w_spaces & init_caps &
# not_all_caps & not_empty
# if (any(check_caps))
# split[, 2][check_caps] <- gsub("([A-Z])", "\\1.",
# split[, 2][check_caps], perl = TRUE)
#Removing spaces between initials
split[, 2] <- gsub("(\\.) (\\p{L}\\.)", "\\1\\2",
split[, 2], perl = TRUE)
split[, 2] <- gsub("(\\.) (\\p{L}\\.)", "\\1\\2",
split[, 2], perl = TRUE)
#Final edits (lowr case initials)
split[, 2][init_low] <- gsub("(\\p{Ll})(\\.)+",
"\\U\\1\\2",
split[, 2][init_low], perl = TRUE)
}
# family name
split[, 1] <- capName(split[, 1])
# split[, 1] <- gsub("(^\\p{Lu})(\\s\\p{Lu})", "\\1.\\2", # muted for now
# split[, 1], perl = TRUE)
# family name prepositions
split[, 3] <- tolower(split[, 3])
}
# Generating the name string
if (get.prep) {
if(format %in% c("last_init", "last_init_prep"))
names <- paste(paste(split[, 1], split[, 2], sep = sep), split[, 3])
if(format %in% c("prep_last_init"))
names <- paste(split[, 3], paste(split[, 1], split[, 2], sep = sep))
if(format %in% c("init_last"))
names <- paste(split[, 2], split[, 3], split[, 1])
} else {
if(format %in% c("last_init", "last_init_prep", "prep_last_init"))
names <- paste(split[, 1], split[, 2], sep = sep)
if(format %in% c("init_last"))
names <- paste(split[, 2], split[, 1])
}
# Final edits on the name string
# names <- stringr::str_squish(names)
names <- gsub("\\s+", " ", names, perl = TRUE)
names <- gsub("^ | $", "", names, perl = TRUE)
names <- gsub(",$", "", names, perl = TRUE)
names <- gsub("\\.\\s\\.", ".", names, perl = TRUE)
nomes[!commas & words & !NAs] <- names
}
if (any(!words & !NAs)) # Names with a single word
nomes[!words & !NAs] <- capName(nomes[!words & !NAs])
if (any(NAs)) # NAs, "" or " "
nomes[NAs] <- NA_character_
#Fixing names in other formats, if present
if (any(oa))
nomes[oa] <- gsub("(O')([a-z])", "\\1\\U\\2", nomes[oa], perl = TRUE)
if (any(mac))
nomes[mac] <- gsub("(Mac)([a-z])", "\\1\\U\\2", nomes[mac], perl = TRUE)
if (any(mc))
nomes[mc] <- gsub("(Mc)([a-z])", "\\1\\U\\2", nomes[mc], perl = TRUE)
return(as.character(nomes))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.