#' Load Locations Dataset
#'
#' @param file The path of the file to be read.
#' @param id Name of the column with a unique identifier (renamed to "id"). If \code{NULL}, locations are assigned an integer id corresponding to their order of appearance in the file.
#' @param xy (delimited text and dbf files) Names of the columns with x and y coordinates (renamed to "lng", "lat" respectively).
#' @param proj4 Current coordinate reference system expressed as a proj.4 string (\url{http://proj4.org/parameters.html}) or EPSG integer code (\url{http://spatialreference.org/ref/epsg/}). If specified, overrides any embedded value (spatial data). If \code{NULL}, the embedded value is used (spatial data) or WGS 84 (EPSG:4326) is assumed if missing.
#' @param format File format to read \code{file} as: "kml", "dbf", "delim" (delimited text), or "ogr" (OGR supported format). If \code{NULL}, the function attempts to guess the format.
#' @param ... Additional parameters passed to \code{\link[data.table]{fread}} (delimited text files), \code{\link[rgdal]{readOGR}} (spatial data), or \code{\link[xml2]{read_xml}} (certain kml files).
#' @export
#' @family location import functions
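#' @examples
#' \dontrun{
#' # Illustrative sketches only; file names and column names are hypothetical.
#' # Delimited text with coordinate columns "X" and "Y" in UTM zone 10N:
#' dt <- read_locations(
#'   "trees.csv", id = "TreeID", xy = c("X", "Y"),
#'   proj4 = "+init=epsg:32610", format = "delim"
#' )
#' # KML (assumed WGS 84), with the format guessed from the file extension:
#' dt <- read_locations("trees.kml")
#' }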
read_locations <- function(file, id = NULL, xy = c("lng", "lat"), proj4 = NULL, format = NULL, ...) {
  # Prepare arguments
  file <- tools::file_path_as_absolute(file)
  if (is.null(proj4)) {
    proj4 <- "+init=epsg:4326"
  }
  to_proj4 <- "+init=epsg:4326"
  if (is.null(format)) {
    has_layers <- suppressWarnings(try(rgdal::ogrListLayers(file), silent = TRUE)) %>%
      {!inherits(., "try-error")}
    has_features <- suppressWarnings(try(rgdal::ogrInfo(file)$have_features, silent = TRUE)) %>%
      {!inherits(., "try-error") && isTRUE(.)}
    is_ogr <- has_layers && has_features
  }
  # Read kml
  read_kml <- function(file, ...) {
    xml <- xml2::read_xml(file, ...)
    # FIXME: Only retains points
    placemarks <- xml %>%
      xml2::xml_find_all(xpath = "//*[local-name() = 'Placemark'][*[local-name() = 'Point']]") %>%
      lapply(xml2::as_list)
    name <- placemarks %>%
      sapply(function(p) if (is.null(p$name)) NA else p$name) %>%
      unlist()
    description <- placemarks %>%
      sapply(function(p) if (is.null(p$description)) NA else p$description) %>%
      unlist()
    coordinates <- placemarks %>%
      sapply(function(p) p$Point$coordinates) %>%
      unlist() %>%
      stringr::str_extract_all(pattern = "([-0-9\\.]+)") %>%
      sapply(as.numeric) %>%
      t()
    tryCatch(
      coordinates %<>% sp_transform(from = proj4, to = to_proj4),
      error = function(e) warning(paste0("Spatial transformation failed:\n", e))
    )
    data.table::data.table(name, description, lng = coordinates[, 1], lat = coordinates[, 2], stringsAsFactors = FALSE)
  }
  # Read ogr
  read_ogr <- function(file, ...) {
    layers <- rgdal::ogrListLayers(file)
    read_layer <- function(layer, ...) {
      shp <- rgdal::readOGR(file, layer, stringsAsFactors = FALSE, ...)
      if (!is.na(sp::proj4string(shp))) {
        tryCatch(
          shp %<>% sp_transform(from = shp@proj4string, to = to_proj4),
          error = function(e) warning(paste0("Spatial transformation failed:\n", e))
        )
      }
      # FIXME: Only retains points
      if (methods::.hasSlot(shp, "coords")) {
        df <- shp@data
        df$lng <- shp@coords[, 1]
        df$lat <- shp@coords[, 2]
        cbind(layer, df, stringsAsFactors = FALSE)
      } else {
        NULL
      }
    }
    layers %>%
      lapply(read_layer, ...) %>%
      Reduce(rbind, .) %>%
      data.table::as.data.table()
  }
  # Read dbf
  read_dbf <- function(file, ...) {
    df <- foreign::read.dbf(file, as.is = TRUE, ...)
    tryCatch(
      df %<>% sp_transform(from = proj4, to = to_proj4, cols = xy),
      error = function(e) warning(paste0("Spatial transformation failed:\n", e))
    )
    df %>%
      data.table::as.data.table()
  }
  # Read delim
  read_delim <- function(file, ...) {
    df <- data.table::fread(file, stringsAsFactors = FALSE, data.table = FALSE, ...)
    tryCatch(
      df %<>% sp_transform(from = proj4, to = to_proj4, cols = xy),
      error = function(e) warning(paste0("Spatial transformation failed:\n", e))
    )
    df %>%
      data.table::as.data.table()
  }
  # Read file
  if (is.null(format)) {
    format <- switch(
      tolower(tools::file_ext(file)),
      dbf = "dbf",
      kml = "kml",
      if (is_ogr) {
        "ogr"
      } else {
        "delim"
      }
    )
  }
  dt <- switch(
    tolower(format),
    dbf = read_dbf(file, ...),
    kml = read_kml(file, ...),
    ogr = read_ogr(file, ...),
    delim = read_delim(file, ...)
  )
  # Standardize coordinate fields
  if (all(!is.empty(xy), xy %in% names(dt))) {
    data.table::setnames(dt, xy, c("lng", "lat"))
  } else {
    warning("No coordinates available!")
  }
  # Standardize id field
  if (all(!is.empty(id), id %in% names(dt))) {
    data.table::setnames(dt, id, "id")
  } else {
    dt[, id := seq_len(nrow(dt))]
  }
  if (any(duplicated(dt$id))) {
    warning("ID field contains duplicates.")
  }
  dt[, id := as.character(id)]
  data.table::setkey(dt, id)
  # Return
  dt[]
}
#' Match Names Against Falling Fruit Types
#'
#' @param names Names as a list of named vectors (or table columns). Use "scientific" for scientific names, "cultivar" for cultivar names, and a locale code for common names (e.g. \code{list(scientific = c("malus pumila"), en = c("apple"))}).
#' @param ids Unique identifier for each row in \code{names}. If \code{NULL}, row indices are used.
#' @param simplify Method used to merge matches for each row in \code{names}: "first", "last", "union", or "intersection" (default "first"). If \code{NULL} or \code{FALSE}, all matches are returned unmerged.
#' @param types Falling Fruit types, as returned by \code{\link{get_ff_types}}.
#' @param max_distance Maximum distance for fuzzy matching, as calculated by \code{\link[stringdist]{stringdistmatrix}}.
#' @param n_nearest Maximum number of nearest fuzzy matches to return.
#' @param ... Additional arguments passed to \code{\link[stringdist]{stringdistmatrix}}.
#' @export
#' @family location import functions
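#' @examples
#' \dontrun{
#' # Illustrative sketch with made-up names; the default types argument calls
#' # get_ff_types(), so a connection to Falling Fruit is assumed.
#' matches <- match_names_to_ff_types(
#'   names = list(
#'     scientific = c("Malus pumila", "Prunus avium"),
#'     en = c("Apple", "Sweet cherry")
#'   ),
#'   simplify = "first"
#' )
#' }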
match_names_to_ff_types <- function(names, ids = NULL, simplify = c("first", "last", "union", "intersection"), types = get_ff_types(pending = FALSE), max_distance = 3, n_nearest = Inf, ...) {
  # Check names
  names <- names[!is.empty(names)]
  supported_locales <- c("scientific", "cultivar", gsub("common_names\\.", "", names(types)[grepl("^common_names\\.[a-z_]+$", names(types))]))
  names <- names[intersect(names(names), supported_locales)]
  n_rows <- unique(sapply(names, length))
  if (length(n_rows) != 1 || n_rows == 0) {
    stop(paste0("Names must be a list of non-empty vectors of equal length from supported locales (", paste(supported_locales, collapse = ", "), ")"))
  }
  # Check ids
  if (is.null(ids)) {
    ids <- seq_len(n_rows)
  }
  # For each locale...
  locales <- names(names)
  name_groups <- cbind(id = ids, data.table::as.data.table(names))[, .(id = list(id)), by = locales]
  match_tables <- lapply(locales, function(locale) {
    # Prepare given names
    given_names <- name_groups[!is.empty(name_groups[[locale]]), .(id = list(unlist(id))), by = locale]
    data.table::setnames(given_names, locale, "name")
    # Prepare corresponding type names
    types_name_field <- switch(locale,
      scientific = "matched_scientific_names",
      cultivar = "matched_cultivars",
      paste("common_names", locale, sep = ".")
    )
    type_names <- types[!is.empty(types[[types_name_field]])][, .(name = unlist(.SD)), by = id, .SDcols = types_name_field]
    # Compute distance matrix
    distance_matrix <- stringdist::stringdistmatrix(tolower(given_names$name), tolower(type_names$name))
    # Build match results
    matches <- lapply(seq_len(nrow(given_names)), function(i) {
      distances <- distance_matrix[i, ]
      is_real <- !is.na(distances)
      is_exact <- is_real & distances == 0
      is_fuzzy <- is_real & distances > 0 & distances < max_distance
      is_fuzzy_sorted <- which(is_fuzzy)[order(distances[is_fuzzy])]
      exact_types <- type_names$id[is_exact]
      fuzzy_types <- type_names$id[is_fuzzy_sorted]
      fuzzy_distances <- distances[is_fuzzy_sorted]
      n_fuzzy <- min(n_nearest, length(fuzzy_types))
      list(
        exact = list(exact_types),
        fuzzy = list(utils::head(fuzzy_types, n_fuzzy)),
        distance = list(utils::head(fuzzy_distances, n_fuzzy))
      )
    })
    # Compile match results
    return(cbind(given_names, data.table::rbindlist(matches)))
  })
  # Return all matches
  if (is.null(simplify) || (is.logical(simplify) && !simplify)) {
    match_tables <- lapply(seq_along(match_tables), function(i) {
      match_tables[[i]][, locale := locales[i]]
    })
    return(data.table::rbindlist(match_tables))
  } else {
    # Return simplified matches
    merged <- data.table::rbindlist(match_tables)
    expanded <- melt_by_listcol(merged, "id")
    # NOTE: Strange CHARSXP errors appear when matching; using ifelse() to avoid them
    matches <- switch(simplify[1],
      first = expanded[, .(
        exact = ifelse(all(is.empty(exact)), list(integer()), exact[!is.empty(exact)][1]),
        fuzzy = ifelse(all(is.empty(fuzzy)), list(integer()), fuzzy[!is.empty(fuzzy)][1])), by = id],
      last = expanded[, .(
        exact = ifelse(all(is.empty(exact)), list(integer()), utils::tail(exact[!is.empty(exact)], 1)),
        fuzzy = ifelse(all(is.empty(fuzzy)), list(integer()), utils::tail(fuzzy[!is.empty(fuzzy)], 1))), by = id],
      union = expanded[, .(exact = list(Reduce(union, exact)), fuzzy = list(Reduce(union, fuzzy))), by = id],
      intersection = expanded[, .(exact = list(Reduce(intersect, exact)), fuzzy = list(Reduce(intersect, fuzzy))), by = id],
      stop(paste("Unsupported simplify:", simplify[1]))
    )
    return(matches[, .(exact, fuzzy = ifelse(all(fuzzy %in% exact), list(integer()), fuzzy[!fuzzy %in% exact])), by = id])
  }
}
#' Build Match Table from Name Matches
#'
#' @param dt Locations data.
#' @param matches Result of \code{\link{match_names_to_ff_types}} with a simplify method selected.
#' @param join_by Name of column in \code{dt} to join to \code{matches}.
#' @param group_by Name of column(s) in \code{dt} to include in the output and to group matches by. If specified, rows are sorted by these columns; otherwise, rows are returned sorted by descending count.
#' @param types Falling Fruit types, as returned by \code{\link{get_ff_types}}.
#' @param saved_table Previous result with saved edits.
#' @param locales Additional common names displayed in match results.
#' @export
#' @family location import functions
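#' @examples
#' \dontrun{
#' # Illustrative sketch continuing from read_locations() and
#' # match_names_to_ff_types(); the column name "common_name" is hypothetical.
#' dt <- read_locations("trees.csv")
#' matches <- match_names_to_ff_types(list(en = dt$common_name), ids = dt$id)
#' match_table <- build_match_table(dt, matches, join_by = "id",
#'   group_by = "common_name")
#' # After editing the table by hand, pass it back as saved_table to rebuild
#' # without losing those edits.
#' }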
build_match_table <- function(dt, matches, join_by = "id", group_by = NULL, types = get_ff_types(pending = FALSE), saved_table = NULL, locales = NULL) {
  # Initial match table
  # id | types (exact_strings[1] if length = 1) | fuzzy_strings | exact_matches | ...
  type_ids <- unique(unlist(matches[, .(exact, fuzzy)]))
  if (is.null(locales)) {
    type_strings <- sapply(type_ids, function(type_id) {
      types[id == type_id, build_type_strings(id, first(common_names.en)[1], first(scientific_names)[1])]
    })
  } else {
    name_fields <- paste("common_names", locales, sep = ".")
    type_strings <- sapply(type_ids, function(type_id) {
      types[id == type_id, build_type_strings(id, first(common_names.en)[1], first(scientific_names)[1], paste(mapply(paste, locales, .SD, sep = ": "), collapse = ", ")), .SDcols = name_fields]
    })
  }
  selected_strings <- sapply(matches$exact, function(ids) {
    if (length(ids) != 1) "" else type_strings[match(ids, type_ids)]
  })
  exact_strings <- sapply(matches$exact, function(ids) {
    if (length(ids) < 2) "" else paste(type_strings[match(ids, type_ids)], collapse = ", ")
  })
  fuzzy_strings <- sapply(matches$fuzzy, function(ids) {
    paste(type_strings[match(ids, type_ids)], collapse = ", ")
  })
  match_temp <- data.table::data.table(id = matches$id, types = selected_strings, fuzzy_matches = fuzzy_strings, exact_matches = exact_strings)
  # Join with location data
  dt.group_by <- paste("dt", group_by, sep = ".")
  dt_subset <- dt[, union(join_by, group_by), with = FALSE]
  data.table::setnames(dt_subset, group_by, dt.group_by)
  merged <- merge(dt_subset, match_temp, by.x = join_by, by.y = "id", all = TRUE)
  # Group by grouping columns
  match_table <- merged[, .(unverified = "", season.start = "", season.stop = "", no.season = "", count = .N, id = paste(id, collapse = ",")), by = c(dt.group_by, "types", "fuzzy_matches", "exact_matches")][order(count, decreasing = TRUE)]
  if (!is.null(group_by)) {
    data.table::setorderv(match_table, paste("dt", group_by, sep = "."))
  }
  # Update from saved table
  if (!is.null(saved_table)) {
    if (!identical(names(match_table), names(saved_table)) || !identical(match_table$name, saved_table$name) || !identical(match_table$id, saved_table$id)) {
      stop("Saved table does not match structure of new results.")
    }
    saved_fields <- c("types", "unverified", "season.start", "season.stop", "no.season")
    for (field in saved_fields) {
      match_table[, (field) := ifelse(is.empty(saved_table[[field]]), match_table[[field]], saved_table[[field]])]
    }
  }
  # Return final table
  return(match_table[])
}
#' Apply Match Table to Locations
#'
#' @param dt Locations data.
#' @param match_table Type assignments, as returned by \code{\link{build_match_table}} (typically after manual review).
#' @param drop Whether to drop unassigned rows in \code{dt}.
#' @param types Falling Fruit types, as returned by \code{\link{get_ff_types}}.
#' @export
#' @family location import functions
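#' @examples
#' \dontrun{
#' # Illustrative sketch: apply a reviewed match table back to the locations
#' # it was built from (objects from the build_match_table() example).
#' dt <- apply_match_table(dt, match_table, drop = TRUE)
#' }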
apply_match_table <- function(dt, match_table, drop = FALSE, types = get_ff_types(pending = FALSE)) {
  # Normalize (and validate) types
  match_table$types <- normalize_type_strings(match_table$types, types)
  # Verify completeness
  if (any(is.empty(match_table$types)) && !drop) {
    stop("Unassigned match_table rows (empty types field). Use drop = TRUE to drop corresponding dt rows.")
  }
  # Assign types
  id_lists <- strsplit(match_table$id, split = "\\s*,\\s*")
  temp <- match_table[rep(1:.N, sapply(id_lists, length))][, id := unlist(id_lists)]
  FIELDS <- intersect(c("id", "types", "unverified", "season.start", "season.stop", "no.season"), names(temp))
  merged <- merge(dt, temp[, FIELDS, with = FALSE], by = "id")
  # Drop unassigned rows
  if (drop) {
    unassigned <- is.empty(merged$types) | merged$types == "NA"
    if (sum(unassigned) > 0) {
      cat(paste("Dropping", sum(unassigned), "unassigned dt row(s)."), sep = "\n")
      merged <- merged[!unassigned]
    }
  }
  return(merged)
}
#' Aggregate Locations by Position
#'
#' Falling Fruit does not support overlapping locations. This function merges locations sharing the same \code{lat} and \code{lng} (or \code{address}) using the methods described in Details.
#'
#' The following aggregation is applied to all non-missing values for each field:
#'
#' \itemize{
#' \item \code{ids, types, author} - Comma-delimited list of all unique values.
#' \item \code{description} - Result of \code{\link{build_location_description}(types = description, notes = notes)}.
#' \item \code{season.start} - Minimum value.
#' \item \code{season.stop} - Maximum value.
#' \item \code{unverified} - True ('x') if any are true.
#' \item \code{yield.rating, quality.rating} - Rounded mean of all values.
#' \item \code{photo.url} - First value.
#' }
#'
#' @param dt Locations data.
#' @param ... Arguments passed to \code{\link{build_location_description}}.
#' @export
#' @family location import functions
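#' @examples
#' \dontrun{
#' # Illustrative sketch: collapse rows sharing identical coordinates after
#' # types have been assigned with apply_match_table().
#' merged <- merge_overlapping_locations(dt)
#' }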
merge_overlapping_locations <- function(dt, ...) {
  # Select position fields
  if (all(c("lat", "lng") %in% names(dt))) {
    position_fields <- c("lat", "lng")
  } else if ("address" %in% names(dt)) {
    position_fields <- "address"
  } else {
    stop("No position fields found (lat,lng | address).")
  }
  # Add missing fields
  dt <- data.table::copy(dt)
  fields <- gsub(" ", ".", tolower(Location_import_fields))
  missing_fields <- setdiff(c(fields, "notes"), names(dt))
  if (length(missing_fields) > 0) {
    dt[, (missing_fields) := NA_character_]
  }
  # Cast field types
  if (!is.list(dt$notes)) {
    dt[, notes := as.list(notes)]
  }
  if (!is.character(dt$id)) {
    dt[, id := as.character(id)]
  }
  # Merge locations by position fields
  is_overlapping <- duplicated(dt, by = position_fields) | duplicated(dt, by = position_fields, fromLast = TRUE)
  dt_single <- dt[!is_overlapping][, description := build_location_descriptions(description, notes, ...)]
  data.table::setnames(dt_single, "id", "ids")
  dt_multi <- dt[is_overlapping, .(
    ids = paste(stats::na.omit(unique(id)), collapse = ", "),
    types = paste(stats::na.omit(unique(types)), collapse = ", "),
    description = build_location_description(description, notes, ...),
    # FIXME: May not work for seasons spanning two calendar years.
    season.start = if (all(is.na(season.start))) NA_integer_ else as.integer(min(season.start, na.rm = TRUE)),
    season.stop = if (all(is.na(season.stop))) NA_integer_ else as.integer(max(season.stop, na.rm = TRUE)),
    no.season = if (any(grepl("^x$|^t$|^true$", no.season, ignore.case = TRUE))) "x" else NA_character_,
    access = unique_na(access),
    unverified = if (any(grepl("^x$|^t$|^true$", unverified, ignore.case = TRUE))) "x" else NA_character_,
    yield.rating = if (all(is.na(yield.rating))) NA_integer_ else as.integer(round(mean(yield.rating, na.rm = TRUE))),
    quality.rating = if (all(is.na(quality.rating))) NA_integer_ else as.integer(round(mean(quality.rating, na.rm = TRUE))),
    author = paste(stats::na.omit(unique(author)), collapse = ", "),
    photo.url = photo.url[1]
  ), by = position_fields]
  # Return merged locations
  field_subset <- intersect(intersect(fields, names(dt_single)), names(dt_multi))
  return(rbind(dt_single[, field_subset, with = FALSE], dt_multi[, field_subset, with = FALSE]))
}
#' Write Locations to File for Import
#'
#' @param dt Locations data.
#' @param file Path to file.
#' @export
#' @family location import functions
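#' @examples
#' \dontrun{
#' # Illustrative sketch: write merged locations to a CSV formatted for
#' # import (the file name is hypothetical).
#' write_locations_for_import(merged, "locations-for-import.csv")
#' }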
write_locations_for_import <- function(dt, file) {
  # Keep only recognized fields
  fields <- gsub(" ", ".", tolower(Location_import_fields))
  if (any(duplicated(intersect(names(dt), fields)))) {
    stop("Duplicate import field names found.")
  }
  dt <- dt[, intersect(fields, names(dt)), with = FALSE]
  # Add missing fields
  missing_fields <- setdiff(fields, names(dt))
  if (length(missing_fields) > 0) {
    dt[, (missing_fields) := NA]
  }
  # Order fields
  data.table::setcolorder(dt, fields)
  # Rename fields
  data.table::setnames(dt, Location_import_fields[match(names(dt), fields)])
  # Write result to file
  utils::write.csv(dt, file, na = "", row.names = FALSE)
}