#' @title Format Locality Information
#'
#' @description Standardize the notation of the locality fields country,
#' stateProvince, municipality and locality, and search from some missing
#' information within the available locality information.
#'
#' @param x a data frame containing typical locality fields from species
#' records.
#' @param loc.levels a vector containing the names of the locality fields to be
#' formatted.
#' @param scrap logical. Should the search of missing locality information be
#' performed? Default to TRUE.
#' @param to.lower logical. Should the output locality names be return in lower
#' cases? Default to TRUE.
#'
#' @return The input data frame \code{x}, plus the '.new' columns with the
#' formatted fields and the resolution of the locality information available.
#'
#' @details The function performs several edits and replacements. Country names
#' are formatted into the international format, letters are lower-cased, and
#' special characters and common abbreviations are removed.
#'
#' By default, this function formats all four locality fields simultaneously
#' (i.e. country, stateProvince, municipality, locality), but the user can
#' choose among these fields through the argument `loc.levels`. However, the
#' process of searching for missing information is more complete if all the
#' four locality fields mentioned above are available.
#'
#' If present, other Darwin Core fields are used internally to obtain missing
#' information on the locality fields declared above, namely: 'countryCode',
#' 'county', and 'verbatimLocality'.
#'
#' The argument `scrap` controls the search for missing municipality
#' information from the field 'locality'. It also performs some extra editing
#' and cropping of the field 'locality' in order to obtain more standardized
#' locality descriptions. This argument uses different ways of splitting and
#' cropping the locality description in order to find missing information.
#' Although it does not always result in an accurate extraction of the
#' information, it provides an extra tool to organize locality information
#' which are not provided in the appropriate columns.
#'
#' The function automatically returns the original resolution of the
#' locality information provided. For instance, if only country information is
#' provided (i.e. field is not empty), then the resolution is flagged as
#' 'country'; if country and stateProvince are given, then the resolution is
#' flagged as 'stateProvince', and so on.
#'
#'
#' @author Renato A. F. de Lima
#'
#' @importFrom stringr str_to_title str_replace_all
#' @importFrom countrycode countrycode
#' @importFrom utils head
#'
#' @export fixLoc
#'
#' @examples
#' # Creating a data frame with locality information
#' (df <- data.frame(country = c("BR", "Brasil", "BRA", "Brazil", NA),
#' stateProvince = c("MG", "estado de Minas Gerais", "Minas Geraes",
#' "Minas Gerais", "Minas Gerais"),
#' municipality = c("Lavras", "lavras", NA, NA, "Lavras"),
#' locality = c(NA, "UFLA", "municipio de Lavras, campus UFLA",
#' "Minas Gerais, municipio Lavras", NA)))
#'
#' # Formating the locality information
#' fixLoc(df, scrap = FALSE)
#' fixLoc(df, scrap = FALSE, to.lower = FALSE)
#'
#' # Formating and scrapping the locality information
#' fixLoc(df, scrap = TRUE)
#'
#' # Formating the locality information only at country and state levels
#' fixLoc(df, loc.levels = c("country", "stateProvince"))[,-c(1:4)]
#'
fixLoc <- function(x,
loc.levels = c("country", "stateProvince", "municipality", "locality"),
scrap = TRUE, to.lower = TRUE) {
## checking input:
if (!class(x)[1] == "data.frame")
stop("input object needs to be a data frame!")
na.strings <- c("", " ", "NA", NA)
# Missing country that may be stored in the field 'countryCode'
if ("countryCode" %in% names(x) & "country" %in% names(x)) {
ids <- !x$countryCode %in% na.strings & x$country %in% na.strings
x$country[ids] <- x$countryCode[ids]
}
# Replacing the names of possibly missing columns
if ("countryCode" %in% names(x) & !"country" %in% names(x))
colnames(x)[which(colnames(x) == "countryCode")] <- "country"
# Missing municipality that may be stored in the field 'county'
if ("municipality" %in% names(x) & "county" %in% names(x)) {
ids <- !x$county %in% na.strings & x$municipality %in% na.strings
x$municipality[ids] <- x$county[ids]
}
if (!"municipality" %in% names(x) & "county" %in% names(x))
colnames(x)[which(colnames(x) == "county")] <- "municipality"
# Missing locality that may be stored in the field 'verbatimLocality'
if ("locality" %in% names(x) & "verbatimLocality" %in% names(x)) {
ids <- !x$verbatimLocality %in% na.strings & x$locality %in% na.strings
x$locality[ids] <- x$verbatimLocality[ids]
}
if (!"locality" %in% names(x) & "verbatimLocality" %in% names(x))
colnames(x)[which(colnames(x) == "verbatimLocality")] <- "locality"
if (!any(c("country", "stateProvince", "municipality", "locality") %in% colnames(x)))
stop("input object needs to have at least one of the following fields: country/countryCode, stateProvince, municipality/county and locality/verbatimLocality")
## Obtaining the intermediary data frame for editing
x1 <- x[, match(loc.levels, colnames(x)), drop = FALSE]
## Loading the dictionary of names, terms, abbreviations and encodign problems to be replaced
enc <- unwantedEncoding
dic <- replaceNames
missLocs <- missLocs
wordsForSearch <- wordsForSearch
## Solving common encoding problems
for (i in 1:length(enc))
x1[] <- lapply(x1, gsub, pattern = names(enc)[i], replacement = enc[i],
ignore.case = TRUE, perl = TRUE)
x1[] <- lapply(x1, gsub, pattern = "\u00AD", replacement = "", perl = TRUE) # soft hyphen
x1[] <- lapply(x1, gsub, pattern = "\u00AO", replacement = "", perl = TRUE) # hidden breaking space
## ADM0: Country level
if (any(c("country","countryCode") %in% loc.levels)) {
# Standardizing country name notation
x1[ ,"country"] <- prepCountry(x1[ ,"country"])
# Replacing missing info by NA
pattern <- paste(missLocs, collapse = "|")
x1[, "country"] <- gsub(pattern, NA, x1[, "country"], perl = TRUE)
x1[, "country"][grepl("desconhecid|unknown", x1[, "country"], perl = TRUE)] <- NA
# Removing brackets from country names
bracks <- grepl('^\\[', x1[, "country"], perl = TRUE) &
grepl('\\]$', x1[, "country"], perl = TRUE)
if (any(bracks))
x1[bracks, "country"] <-
gsub("^\\[|\\]$", "", x1[bracks, "country"], perl = TRUE)
# Replacing variants, abbreviations, typos, and non-standard names
tmp1 <- dic[dic$class %in% "country" & apply(is.na(dic[, 2:4]), 1, all), ]
tmp2 <- tmp1$replace
names(tmp2) <- tmp1$pattern
names(tmp2) <- gsub("\\\\", "", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\.', "\\\\.", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\(', "\\\\(", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\)', "\\\\)", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\[', "\\\\[", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\]', "\\\\]", names(tmp2), perl = TRUE)
x1[, "country"] <- stringr::str_replace_all(x1[, "country"], tmp2)
# Missing country for non missing states and counties (only for uniquivocal states)
tmp1 <- dic[dic$class %in% "country" & dic$condition2 %in% "not_is.na", ]
reps <- unique(tmp1$replace)
if (all(c("stateProvince", "municipality") %in% names(x1)) & any(reps %in% unique(x1[ ,"country"]))) {
reps1 <- reps[reps %in% unique(x1[, "country"])]
for (i in 1:length(reps1)) {
x1$country[is.na(x1[ ,"country"]) & !is.na(x1[, "municipality"]) &
x1[ ,"stateProvince"] %in% tmp1$condition1[tmp1$replace %in% reps1[i]]] <- reps1[i]
}
}
}
## ADM1: State/Province level ##
if ("stateProvince" %in% loc.levels) {
# Removing unwanted characters
x1[, "stateProvince"] <- tolower(rmLatin(x1[, "stateProvince"]))
# Replacing missing info by NA
pattern <- paste(missLocs, collapse = "|")
x1[ ,"stateProvince"] <- gsub(pattern, NA, x1[ ,"stateProvince"], perl = TRUE)
x1[ ,"stateProvince"][grepl("desconhecid|unknown", x1[, "stateProvince"], perl = TRUE)] <- NA
# Removing unwanted prefixes and abbreviations
pattern <- paste(wordsForSearch, collapse = "|")
x1[ ,"stateProvince"] <- gsub(pattern, "", x1[ ,"stateProvince"], perl = TRUE)
# Replacing variants, abbreviations, typos, and non-standard names
tmp1 <- dic[dic$class %in% "stateProvince" & !is.na(dic[ ,2]),]
tmp2 <- tmp1$replace
names(tmp2) <- tmp1$pattern
names(tmp2) <- gsub('\\.', "\\\\.", names(tmp2), perl = TRUE)
cond0 <- unique(tmp1$condition0)
if ("country" %in% names(x1)) {
if (any(cond0 %in% unique(x1[, "country"]))) {
cond1 <- cond0[cond0 %in% unique(x1[, "country"])]
for (i in 1:length(cond1)) {
tmp2.i <- tmp2[tmp1$condition0 %in% cond1[i]]
x1[x1[, "country"] %in% cond1[i] ,"stateProvince"] <-
stringr::str_replace_all(x1[x1[,"country"] %in% cond1[i] ,"stateProvince"], tmp2.i)
}
}
}
}
## ADM2: County, Departament, Commune
if ("municipality" %in% loc.levels) {
# Removing unwanted characters and replacing missing info by NA
x1[, "municipality"] <- tolower(rmLatin(x1[, "municipality"]))
# Replacing missing info by NA
pattern <- paste(missLocs, collapse = "|")
x1[ ,"municipality"] <- gsub(pattern, NA, x1[ ,"municipality"], perl = TRUE)
x1[ ,"municipality"][grepl("desconhecid|unknown", x1[, "municipality"], perl = TRUE)] <- NA
# Removing unwanted prefixes and abbreviations
tmp1 <- dic[dic$class %in% "county" & apply(is.na(dic[,2:4]), 1, all),]
tmp2 <- tmp1$replace
names(tmp2) <- tmp1$pattern
names(tmp2) <- gsub('\\.', "\\\\.", names(tmp2), perl = TRUE)
x1[ ,"municipality"] <- stringr::str_replace_all(x1[ ,"municipality"], tmp2)
# Some extra removals
tmp1 <- dic[dic$class %in% "locality1" & apply(is.na(dic[,2:4]), 1, all),]
tmp2 <- rep("", dim(tmp1)[1])
names(tmp2) <- tmp1$pattern
names(tmp2) <- gsub('\\.', "\\\\.", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\(', "\\\\(", names(tmp2), perl = TRUE)
x1[ ,"municipality"] <- stringr::str_replace_all(x1[ ,"municipality"], tmp2)
}
## ADM3: locality (park, farm, etc.)
if (any(c("locality") %in% loc.levels)) {
# Removing unwanted characters
x1[, "locality"] <- tolower(rmLatin(x1[, "locality"]))
# Removing unwanted prefixes and abbreviations (1st round)
tmp1 <- dic[dic$class %in% "locality1" & apply(is.na(dic[,2:4]), 1, all),]
tmp2 <- tmp1$replace
names(tmp2) <- tmp1$pattern
names(tmp2) <- gsub('\\.', "\\\\.", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\(', "\\\\(", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\)', "\\\\)", names(tmp2), perl = TRUE)
x1[ ,"locality"] <- stringr::str_replace_all(x1[ ,"locality"], tmp2)
# solving some substitution problems
x1[ ,"locality"] <- gsub(" de de | de of ", " de ", x1[ ,"locality"], perl = TRUE)
x1[ ,"locality"] <- gsub(" de do ", " do ", x1[ ,"locality"], perl = TRUE)
# Removing unwanted prefixes and abbreviations (2nd round)
tmp1 <- dic[dic$class %in% "locality2" & apply(is.na(dic[,2:4]), 1, all),]
tmp2 <- tmp1$replace
names(tmp2) <- tmp1$pattern
names(tmp2) <- gsub('\\.', "\\\\.", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\(', "\\\\(", names(tmp2), perl = TRUE)
names(tmp2) <- gsub('\\)', "\\\\)", names(tmp2), perl = TRUE)
x1[ ,"locality"] <- stringr::str_replace_all(x1[ ,"locality"], tmp2)
}
if (c("locality") %in% loc.levels & scrap) {
# Spliting the locality vector to find missing information
n4 <- strsplit(x1[ ,"locality"], ",|\\.|:|\\(|\\)|;", perl = TRUE)
n4 <- sapply(n4, function(x) gsub("^ | $", "", x, perl = TRUE))
# trying to get missing counties from the locality description (e.g. "Pico das Almas, municipio de Rio de Contas")
n4.2 <- as.character(sapply(n4, function(x)
paste(unique(gsub("^ | $", "",
x[grepl("municipio|municipality|county|provincia|village", x, perl = TRUE)], perl = TRUE
)), collapse = "|")))
n4.2 <- gsub("^ | $", "", n4.2, perl = TRUE)
n4.2 <- gsub("municipio de |municipality of |municipio do |^county of |^provincia de|^provincia of|village of",
"",
n4.2, perl = TRUE)
n4.2 <- gsub("municipio |municipality |^county |^provincia |village ",
"",
n4.2, perl = TRUE)
n4.2[n4.2 %in% ""] <- NA
# getting missing counties that may be the first part of the locality description (e.g. "Rio de Contas, Pico das Almas")
n4.2.1 <- as.character(sapply(n4, function(x) x[1]))
n4.2.1 <- gsub("^ | $", "", n4.2.1, perl = TRUE)
n4.2.1[n4.2.1 %in% ""] <- NA
# isolating localities possibily in the gazetter (e.g. parks, serras, farms)
locais <- "parque|reserva|reserve|fazenda|nacional|estadual|parna|flona|rebio|rppn|e\\.e\\.|biologica|ecologica|extrativista|park|farm|estrada|rodovia|road|^sitio|^mata|^horto|^jardim|campus|^pico|^serra|^sierra|^morro|^chapada"
n4.3 <- as.character(sapply(n4, function(x)
paste(unique(gsub("^ | $", "",
x[grepl(locais, x, perl = TRUE)], perl = TRUE
)), collapse = ", ")))
n4.3 <- gsub("^ | $", "", n4.3, perl = TRUE)
n4.3[n4.3 %in% ""] <- NA
#other localities, than the ones in 'locais' and counties (first sentence, prior to the first comma when n4.3 is empty)
n4.3[is.na(n4.3) & !is.na(x1[ ,"locality"])] <-
as.character(sapply(n4[is.na(n4.3) & !is.na(x1[ ,"locality"])],
function(x) utils::head(x[!grepl("municipio|municipality|county|provincia|village", x, perl = TRUE)], n = 1)))
# function(x) my.head(x[!grepl("municipio|municipality|county|provincia|village", x, perl = TRUE)])))
# Replacing missing counties
if (any(c("municipality", "county") %in% loc.levels)) {
# priority 1: localities specifying a county name
x1[, "municipality"][is.na(x1[, "municipality"]) & !is.na(n4.2)] <-
gsub("^ | $", "", n4.2[is.na(x1[, "municipality"]) & !is.na(n4.2)], perl = TRUE)
# priority 2: first part of the locality description
x1[, "municipality"][is.na(x1[, "municipality"]) & is.na(n4.2)] <-
gsub("^ | $", "", n4.2.1[is.na(x1[, "municipality"]) & is.na(n4.2)], perl = TRUE)
}
# Replacing edited/missing localities
x1[ ,"locality"][!is.na(n4.3)] <- n4.3[!is.na(n4.3)]
x1[ ,"locality"][is.na(n4.3)] <-
as.character(sapply(n4[is.na(n4.3)], function(x) x[1]))
}
## Trimming and editing the edited columns
for (i in 1:length(loc.levels))
x1[, i] <- as.character(gsub("^ | $", "", x1[, i], perl = TRUE))
for (i in 1:length(loc.levels))
x1[, i] <- gsub("^-$", NA, x1[, i], perl = TRUE)
## Resolution of the locality information provided
tmp <- apply(x1[, loc.levels, drop = FALSE], 1, function(x)
which(is.na(x))[1] - 1)
tmp[tmp %in% 0] <- length(loc.levels) + 1
tmp[is.na(tmp)] <- length(loc.levels)
resol.orig <- c(loc.levels, "no_info")[tmp]
## Preparing the output
if (length(loc.levels) == 1) {
res <- as.vector(x1[,1])
if (!to.lower)
res <- stringr::str_to_title(res)
} else {
names(x1) <- paste0(names(x1), ".new")
res <- as.data.frame(x1)
if (c("locality.new") %in% names(x1) & scrap)
res$locality.scrap <- n4.2.1
if (!to.lower) {
names.res <- names(res)
for(i in 1:length(names.res))
res[, names.res[i]] <- stringr::str_to_title(res[, names.res[i]])
}
res$resol.orig <- resol.orig
}
## Merging the results and returning
result <- cbind.data.frame(x, res, stringsAsFactors = FALSE)
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.