knitr::opts_chunk$set(echo = TRUE) library(caret) library(CoordinateCleaner) library(dplyr) library(ggplot2) library(knitr) library(magrittr) library(terra) library(readr) library(tidyr) library(viridis)
# A function for generating captions and cross-references fig <- local({ i <- 0 list( cap=function(refName, text, center=FALSE, col="black", inline=FALSE) { i <<- i + 1 ref[[refName]] <<- i css_ctr <- "" if (center) css_ctr <- "text-align:center; display:inline-block; width:100%;" cap_txt <- paste0("<span style=\"color:", col, "; ", css_ctr, "\">Figure ", i, ": ", text , "</span>") anchor <- paste0("<a name=\"", refName, "\"></a>") if (inline) { paste0(anchor, cap_txt) } else { list(anchor=anchor, cap_txt=cap_txt) } }, ref=function(refName, link=FALSE, checkRef=TRUE) { ## This function puts in a cross reference to a caption. You refer to the ## caption with the refName that was passed to fig$cap() (not the code chunk name). ## The cross reference can be hyperlinked. if (checkRef && !refName %in% names(ref)) stop(paste0("fig$ref() error: ", refName, " not found")) if (link) { paste0("<A HREF=\"#", refName, "\">Figure ", ref[[refName]], "</A>") } else { paste0("Figure ", ref[[refName]]) } }, ref_all=function(){ ## For debugging ref }) }) ## This chunk replaces the default hook for processing plots. It achieves the purposes, ## of laying out auto-numbered captions, but other functionality may be gone. knit_hooks$set(plot = function(x, options) { sty <- "" if (options$fig.align == 'default') { sty <- "" } else { sty <- paste0(" style=\"text-align:", options$fig.align, ";\"") } if (is.list(options$fig.cap)) { ## options$fig.cap is a list returned by the function fig$cap() str_caption <- options$fig.cap$cap_txt str_anchr <- options$fig.cap$anchor } else { ## options$fig.cap is a character object (hard coded, no anchor) str_caption <- options$fig.cap str_anchr <- "" } paste('<figure', sty, '>', str_anchr, '<img src="', opts_knit$get('base.url'), paste(x, collapse = '.'), '"><figcaption>', str_caption, '</figcaption></figure>', sep = '') }) ## This chucnk will read through *this* Rmd file, and attempt to extract all of the ## labels (not caption text) used for Figure captions. These labels are used ## as anchors, so scanning through the document now will allow us to create cross references ## before the caption actually appears. ## Get the name of this Rmd file rmdFn <- knitr::current_input() # filename of input document ## Read lines and close connection rmdCon <- file(rmdFn, open = "r") rmdLines <- readLines(rmdCon) close(rmdCon) ## Pull out all occurences of at least one back tick, followed ## by any number of characters, followed by fig$cap (all on one line) figscap_idx <- grep("`+(.*)fig\\$cap", rmdLines) rmdLines <- rmdLines[figscap_idx] ## Get rid of everything up until the start of the caption label ## This presumes the caption label is the first argument of fig$cap() ## E.g., fig.cap = fig$cap("my_label", ...) rmdLinesSansPre <- sub("(.*)fig\\$cap(.*?)[\"']", "", rmdLines) ## Identify everything up until the first quote match_data <- regexpr("(.*?)[\"']", rmdLinesSansPre) ## Reduce the length by one, because we're not interested in the final quote attr(match_data, "match.length") <- attr(match_data, "match.length") - 1 ## Extract fig_labels <- regmatches(rmdLinesSansPre, match_data, invert=FALSE) if (length(fig_labels) > 0) { ## Test for duplicates if (anyDuplicated(fig_labels) > 0) stop("Duplicate caption labels detected") ## Create a named list of Figure numbers ref <- as.list(1:length(fig_labels)) names(ref) <- fig_labels }
Some problems with biological collection data are not apparent from individual records, but rather linked to properties of an entire data set. The CleanCoordinatesDS function can use dataset properties to flag three types of potential problems, under the assumption that these problems will effect many records in a dataset from the same source (but not necessarily all):
An erroneous conversion of coordinates in degree minute annotation into decimal degrees, where the decimal sign is erroneously translated into the decimal delimiter, e.g. 10°30' to 10.3 °. This is a problem that has been observed in particular for older data sets.
A periodicity in the decimals of a data set, as will arise when coordinates are either rounded or recorded in a raster format and coordinates represent raster cell centres. This problem represents low precision rather than errors, but can also be fatal, if undetected and taken as actual localities, for example in distribution modelling.
cd_ddmm
)Geographic coordinates in a longitude/latitude coordinate reference system can be noted in different ways. The most common notations are degree minutes seconds (ddmm, e.g. 38°54'22'') and decimal degrees (dd.dd, e.g. 38.90611°). A hybrid annotation of degrees with decimal minutes is also sometimes used (e.g. 38°54.367). However, analyses using distribution data almost exclusively rely on the machine readable decimal-degree format. Therefore the diversity of formats is challenging for databases comprised of coordinate records from different sources which potentially use different annotation formats. Systematic errors can arise, especially if old data are combined and digitized automatically, without appropriate conversion. A particular problem reported repeatedly is the misinterpretation of the degree sign (°) as the decimal delimiter (e.g. 38°54' converted to 38.54°), which leads to biased geographic occurrence information. A particular caveat in identifying these problems post-hoc in a database compiled from many sources is that biased records might be mixed with unproblematic records. For instance, specimens from a certain herbarium might have been digitized in different instances in a way, that part of the records are biased whereas others are not.
As part of this study we present a novel algorithm to identify data sets potentially biased by erroneous conversion from ddmm to dd.dd due to the misinterpretation of the degree sign as decimal delimiter.
load(file = "inst/analyses_matrix_Clean") dat0 <- raster(dat.t1)%>% rasterToPoints()%>% as.data.frame()%>% mutate(layer = ifelse(layer == 0, NA, layer))%>% mutate(layer = as.character(layer)) %>% mutate(layer = parse_factor(layer, levels = c(NA,"1")))%>% mutate(ds = "Error fraction 0") load(file = "inst/analyses_matrix_Error_fraction_0.2") dat02 <- raster(dat.t1)%>% rasterToPoints()%>% as.data.frame()%>% mutate(layer = ifelse(layer == 0, NA, layer))%>% mutate(layer = as.character(layer)) %>% mutate(layer = parse_factor(layer, levels = c(NA,"1")))%>% mutate(ds = "Error fraction 0.2") load("inst/analyses_matrix_Error_fraction_0.5") dat08 <- raster(dat.t1)%>% rasterToPoints()%>% as.data.frame()%>% mutate(layer = ifelse(layer == 0, NA, layer))%>% mutate(layer = as.character(layer)) %>% mutate(layer = parse_factor(layer, levels = c(NA,"1")))%>% mutate(ds = "Error fraction 0.5") load("inst/analyses_matrix_Error_fraction_1") dat1 <- raster(dat.t1)%>% rasterToPoints()%>% as.data.frame()%>% mutate(layer = ifelse(layer == 0, NA, layer))%>% mutate(layer = as.character(layer)) %>% mutate(layer = parse_factor(layer, levels = c(NA,"1")))%>% mutate(ds = "Error fraction 1") dat <- bind_rows(dat0, dat02, dat08, dat1) dat$ds <- factor(dat$ds, levels = c("Error fraction 0", "Error fraction 0.2", "Error fraction 0.5", "Error fraction 1")) ggplot(data = dat)+ geom_raster(aes(x = x, y = y, fill = layer))+ scale_fill_manual(values = c("darkgreen", "white"))+ scale_y_continuous(limits=c(0, 1), expand = c(0, 0))+ scale_x_continuous(limits=c(0, 1), expand = c(0, 0))+ xlab ("Longitude decimals")+ ylab("Latitude decimals")+ theme_bw()+ theme(legend.position = "none")+ facet_wrap(~ds)
The binomial test and frequency comparison described in the main text are based on an analysis matrix which is recording the distribution of coordinates decimals in the longitude/latitude space (r fig$ref("fig_analmat", link = TRUE)
). The analysis matrix does not record the number of records in a cell, but only presence/absence, to account for clustered sampling (i.e. a large number of records with similar decimals are most likely not related to conversion error, but rather to multiple samples from the same location, or coordinate rounding, see section 3).
The test is implemented in the cd_ddmm
function and the ddmm
argument of the clean_dataset
wrapper function. The input follows the general package design, and is a data.frame
with at least three columns including the decimal longitude and latitude and a data set identifier for each record. The names of the columns can be specified via the lon
, lat
and ds
arguments. There are three additional arguments to customize the test, in particular to modify its sensitivity to the fraction of a data set that is biased: The pvalue
argument controls the cut-off p-value for significance of the one-sided t-test. The diff
argument controls the threshold difference for the ddmm test
, and indicates by which fraction the records with decimals below 0.6 must outnumber the records with decimals above 0.6. The size of the analysis matrix can be adapted using the mat.size
argument.
We used simulations to asses (A) the effect of varying the diff parameter and (B) the effect of data set size on the performance of the ddmm
test.
We simulated 100,000 datasets of species occurrences with varying number of records and degree of sample clustering. For each iteration we first draw a random number, N as $\Gamma(\alpha = 2, \beta = 1) *500$ for the number of records. We the simulated N latitude and longitude coordinates between 0° an 90° using $K \in [1,5)$ truncated normal distributions with $\mu_i \sim \mathcal{U}(0,90)$ and $\sigma \sim \mathcal{U}(0.1,5)$. We then added a bias by replacing between 0 - 80% of the samples by records with decimals sampled from $\mathcal{U}(0, 0.599)$. We then analysed the simulated data using cd_ddmm
using diff parameters between 0.1 and 1.
r fig$ref("fig_ddmm_diff", link = TRUE)
shows the effect of the diff
parameter on the sensitivity of the cd_ddmm
test and r fig$ref("fig_dataset_size", link = TRUE)
the effect of dataset size. In general, the test is identifying the presence of a bias with rate > 0.1 (10\% bias) well, for datasets with more than 100 individual occurrence records. However, for empirical data we recommend a more conservative diff
threshold of 0.5-1 to identify datasets with more than 30\% bias. This is because smaller bias rates might be caused by irregular sampling rather than conversion errors. However, the advisable diff threshold depends on downstream analyses and higher diff
values might be necessary (r fig$ref("fig_ddmm_diff", link = TRUE)
). We suggest to manually check the decimal distribution in flagged datasets to avoid data loss. This can be done easily by visually inspecting the analyses matrix, by setting the diagnostic
argument of cd_ddmm
to TRUE.
# False positive/negative rates - diff parameter load("inst/simulations_cd_ddmm.Rds") dat <- plo <- simulations_cd_ddmm #plot summary stats with non-bias = 0 plo <- plo[!is.na(plo$pass),] plo <- plo%>% mutate(biased = ifelse(bias.rate == 0, F, T))%>% mutate(correct = pass != biased) #remember: pass = T means not biased/test passed summ <- plo%>% group_by(diff, bias.rate)%>% summarize(true.positive = sum(biased & !pass), true.negative = sum(!biased & pass), false.positive = sum(!biased & !pass), false.negative = sum(biased & pass), sensitivity = true.positive / (true.positive + false.negative), specificity = true.negative / (true.negative + false.negative), precision = true.positive /(true.positive + false.positive), detection.rate = true.positive / (true.positive + false.positive + true.negative + false.negative), n = n(), false.negative.rate = false.negative / n, true.positive.rate = true.positive / n, true.negative.rate = true.negative / n, false.positive.rate = false.positive / n) suma <- summ%>% filter(bias.rate == 0)%>% dplyr::select(diff, false.positives = false.positive, n, false.positive.rate)%>% mutate(false.positive.rate = round(false.positive.rate, 2)) summ <- summ%>% dplyr::select(-true.positive, -true.negative, -false.positive, -false.negative, -n, -precision, -specificity, -detection.rate, -sensitivity)%>% mutate(false.positive.rate = ifelse(bias.rate > 0, NA, false.positive.rate))%>% mutate(true.negative.rate = ifelse(bias.rate > 0, NA, true.negative.rate)) ## plot summary stats plo.sum <- gather(summ, index, value,-diff, -bias.rate) ## diff threshold ggplot(plo.sum)+ geom_point(aes(x = bias.rate, y = value, group = as.factor(diff), col = as.factor(diff)))+ geom_line(aes(x = bias.rate, y = value, group = as.factor(diff), col = as.factor(diff)))+ scale_colour_viridis(discrete = T, name = "Diff threshold")+ facet_wrap(~index, scale = "fixed")+ ylim(0,1)+ xlab("Bias rate")+ ylab("Rate")+ theme_bw()+ theme(legend.position = "bottom")
# False positive/negative rates - dataset size dat$ds <- cut(dat$dataset.size, breaks = c(0,100, 500, 1000, 10000, 20000)) plo <- dat #plot summary stats with non-bias = 0 plo <- plo[!is.na(plo$pass),] plo <- plo%>% mutate(biased = ifelse(bias.rate == 0, F, T))%>% mutate(correct = pass != biased) #remember: pass = T means not biased/test passed summ <- plo%>% group_by(ds, bias.rate)%>% summarize(true.positive = sum(biased & !pass), true.negative = sum(!biased & pass), false.positive = sum(!biased & !pass), false.negative = sum(biased & pass), sensitivity = true.positive / (true.positive + false.negative), specificity = true.negative / (true.negative + false.negative), precision = true.positive /(true.positive + false.positive), detection.rate = true.positive / (true.positive + false.positive + true.negative + false.negative), n = n(), false.negative.rate = false.negative / n, true.positive.rate = true.positive / n, true.negative.rate = true.negative / n, false.positive.rate = false.positive / n) summ <- summ%>% dplyr::select(-true.positive, -true.negative, -false.positive, -false.negative, -n, -precision, -specificity, -detection.rate, -sensitivity)%>% mutate(false.positive.rate = ifelse(bias.rate > 0, NA, false.positive.rate))%>% mutate(true.negative.rate = ifelse(bias.rate > 0, NA, true.negative.rate)) ## plot summary stats plo.sum <- gather(summ, index, value,-ds, -bias.rate) ## diff threshold ggplot(plo.sum)+ geom_point(aes(x = bias.rate, y = value, group = as.factor(ds), col = as.factor(ds)))+ geom_line(aes(x = bias.rate, y = value, group = as.factor(ds), col = as.factor(ds)))+ scale_colour_viridis(discrete = T, name = "Number of records in dataset")+ facet_wrap(~index, scale = "fixed")+ ylim(0,1)+ xlab("Bias rate")+ ylab("Rate")+ theme_bw()+ theme(legend.position = "bottom")
The simulations suggest the expectable false positives for a given diff (p-value constant at 0.025) are low.
knitr::kable(suma, caption = "The rate of false positives at various rates of diff.")
The conversion error test is implemented in the cd_ddmm function. You can easily run the test with few lines of code. By default the test requires a two degree span for each tested dataset to avoid false flags due to habitat restrictions, as for instance on islands or patchy species distributions, for instance islands of forests in grassland. Nevertheless, we consider the cd_ddmm
test a tool to identify potentially problematic datasets rather than for automatic filtering and recommend to double check flagged datasets using summary statistics and the diagnostic plots.
clean <- data.frame(species = letters[1:10], decimalLongitude = runif(100, -180, 180), decimalLatitude = runif(100, -90,90), dataset = "clean") #problematic dataset lon <- sample(0:180, size = 100, replace = TRUE) + runif(100, 0,0.59) lat <- sample(0:90, size = 100, replace = TRUE) + runif(100, 0,0.59) biased <- data.frame(species = letters[1:10], decimalLongitude = lon, decimalLatitude = lat, dataset = "biased") dat <- rbind(clean, biased) # with diagnostic plots and small matrix due to small dataset size and for visualization par(mfrow = c(1,2)) cd_ddmm(x = dat, diagnostic = TRUE, value = "dataset", mat_size = 100) #inspect geographic extent of the flagged dataset to exclude island or patchy habitat min(biased$decimalLongitude) max(biased$decimalLongitude)
The diagnostic plots clearly show the biased distribution of decimals in the biased dataset and the large geographic extent excludes islands or patchy habitats as cause of this distribution.
cd_round
)Species occurrence records with coordinates often do not represent point occurrences but are either derived from rasterized collection designs (e.g. presence/absence in a 100x100 km grid cell) or have been subject to strong decimal rounding. If the relevant meta-data are missing, these issues cannot be identified on the record level, especially if the records have been combined with precise GPS-based point occurrences into data sets of mixed precision. However, knowledge of coordinate precision and awareness of a large number of imprecise records in a data set can be crucial for downstream analyses. For instance, coordinates collected in a 100km x 100km raster might be unsuitable for species distribution modelling on the local and regional scale.
The sensitivity of the algorithms presented in the main text test can be customized based on the raster regularity and the fraction of biased records in a dataset (T1
), the geographic extent of the raster(reg.dist.min
and reg.dist.max
) and the number of raster nodes (reg.out.thresh
).
simDS <- function(n.rec, K = 3, data.range.min = -40, data.range.max = 40, sig.min = 0.1, sig.max = 5, E.res = 0.1, E = 0, nrep){ #simulate bias grid bi.rang <- E.res * 5 bi.stpt <- sample(data.range.min:(data.range.max - bi.rang), size = 1) data.rang <- (data.range.max + 180) - (data.range.min + 180) if(data.rang < bi.rang){ data.range.max = data.range.min + bi.rang bi.stpt <- data.range.min } bi.lon <- bi.stpt + rep(seq(0, E.res * 4,by = E.res), times = round(n.rec * E / 5, 0)) bi.rang <- E.res * 5 bi.stpt <- sample(data.range.min:(data.range.max - bi.rang), size = 1) data.rang <- (data.range.max + 90) - (data.range.min + 90) if(data.rang < bi.rang){ data.range.max = data.range.min + bi.rang bi.stpt <- data.range.min } bi.lat <- bi.stpt + rep(seq(0, E.res * 4,by = E.res), times = round(n.rec * E / 5, 0)) #longitude #simulate the data according to specifications n.dis <- K mu <- runif(n.dis, data.range.min, data.range.max) sig <- runif(n.dis, sig.min, sig.max) #clean data cl.lon <- msm::rtnorm(n = round(n.rec * (1 - E), 0), mean = mu, sd = sig, lower = data.range.min, upper = data.range.max) #latitude n.dis <- K mu <- runif(n.dis, data.range.min, data.range.max) sig <- runif(n.dis, sig.min, sig.max) #clean data cl.lat<- msm::rtnorm(n = round(n.rec * (1 - E), 0), mean = mu, sd = sig, lower = data.range.min, upper = data.range.max) #add biased data inp.lon <- c(cl.lon, bi.lon) inp.lat <- c(cl.lat, bi.lat) inp <- data.frame(decimalLongitude = inp.lon, decimalLatitude = inp.lat, dataset = nrep) results <- data.frame(dataset.size = n.rec, data.range = paste(data.range.min, data.range.max, sep = "_"), number.of.seeds = n.dis, seed.means = paste(mu, collapse = "_"), seed.sigma = paste(sig, collapse = "_"), bias.rate = E, bias.res = E.res, bias.range = E.res * 5, bias.start = bi.stpt, nrep = nrep, stringsAsFactors = F) out <- list(inp, results) return(out) } # Run simulation plus analyses #simulate data sims.cl <- simDS(n.rec = 1000, K = 5, data.range.min = 0, data.range.max = 30, sig.min = 0.1, sig.max = 2, E = 0, E.res = 2, nrep = "No bias") sims.bia <- simDS(n.rec = 1000, K = 5, data.range.min = 0, data.range.max = 30, sig.min = 0.1, sig.max = 2, E = 0.05, E.res = 2, nrep = "10% bias") #run test par(mfrow = c(2,2)) res <- cd_round(x = sims.cl[[1]], lon = "decimalLongitude", ds = "dataset", test = "lon", value = "dataset") res <- cd_round(x = sims.bia[[1]], lon = "decimalLongitude", ds = "dataset", test = "lon", value = "dataset")
We simulated 100,000 datasets of species occurrences with varying number of records and degree of sample clustering. For each iteration we first draw a random number, $N \sim \Gamma(\alpha = 2, \beta = 1) *500$ for the number of records. We the simulated N latitude and longitude coordinates between 0° an 90° using $K \in [1,5)$ truncated normal distributions with $\mu_i \sim \mathcal{U}(0,90)$ and $\sigma \sim \mathcal{U}(0.1,5)$. We then added a fraction $\rho$ of biased records, where $\rho \in[0,0.6]$ (=0-60%). We assumed a rasterized bias with five nodes (i.e. a raster with five rows and five columns). We first sampled a random coordinate as origin for the raster and then sampled the four remaining nodes at a resolution $\tau \in [0.1, 2]$. We then analysed the simulated data using the cd_round
function with T1
parameters between 3 and 13. r fig$ref("fig_clustering_t7_results", link = TRUE)
shows the effect of the T1
parameter on the sensitivity of the ds_ddmm
test.
load("inst/autocorrelation_simulations.Rds") #plot summary stats with non-bias = 0 plo <- plo[!is.na(plo$flag),] plo <- plo%>% # mutate(biased = ifelse(bias.rate < 0.1, F, T))%>% mutate(biased = ifelse(bias.rate == 0, F, T))%>% mutate(correct = flag != biased) #remember: flag = T means not biased/test passed summ <- plo %>% group_by(outlier.threshold, bias.rate)%>% summarize(true.positive = sum(biased & !flag), true.negative = sum(!biased & flag), false.positive = sum(!biased & !flag), false.negative = sum(biased & flag), sensitivity = true.positive / (true.positive + false.negative), specificity = true.negative / (true.negative + false.negative), precision = true.positive /(true.positive + false.positive), detection.rate = true.positive / (true.positive + false.positive + true.negative + false.negative), n = n(), false.negative.rate = false.negative / n, true.positive.rate = true.positive / n, true.negative.rate = true.negative / n, false.positive.rate = false.positive / n) summ <- summ %>% dplyr::select(-true.positive, -true.negative, -false.positive, -false.negative, -n, -precision, -specificity, -detection.rate, -sensitivity)%>% mutate(false.positive.rate = ifelse(bias.rate > 0, NA, false.positive.rate))%>% mutate(true.negative.rate = ifelse(bias.rate > 0, NA, true.negative.rate)) ## plot summary stats plo.sum <- gather(summ, index, value,- outlier.threshold, -bias.rate) ## outlier threshold ggplot(plo.sum)+ geom_point(aes(x = bias.rate, y = value, group = as.factor(outlier.threshold), col = as.factor(outlier.threshold)))+ scale_colour_viridis(discrete = T, name = "Outlier threshold (T1)")+ facet_wrap(~index, scale = "fixed")+ ylim(0,1)+ xlab("Bias rate")+ ylab("Rate")+ theme_bw()+ theme(legend.position = "bottom")
The following example illustrates the use of cd_round
.
#simulate bias grid, one degree resolution, 10% error on a 1000 records dataset #simulate biased fraction of the data, grid resolution = 1 degree #simulate non-biased fraction of the data bi <- sample(3 + 0:5, size = 100, replace = TRUE) mu <- runif(3, 0, 15) sig <- runif(3, 0.1, 5) cl <- rnorm(n = 900, mean = mu, sd = sig) lon <- c(cl, bi) bi <- sample(9:13, size = 100, replace = TRUE) mu <- runif(3, 0, 15) sig <- runif(3, 0.1, 5) cl <- rnorm(n = 900, mean = mu, sd = sig) lat <- c(cl, bi) biased <- data.frame(decimalLongitude = lon, decimalLatitude = lat, dataset = "biased") # simulate unbiased data lon <- runif(n = 1000, min = -30, max = 30) lat <- runif(n = 1000, min = -30, max = 30) clean <- data.frame(decimalLongitude = lon, decimalLatitude = lat, dataset = "clean") dat <- rbind(biased, clean) #run test, only longitude for better visualization par(mfrow = c(2,2)) cd_round(dat, value = "dataset", test = "lon")
If value = "dataset"
the output of cd_round is a table indicating the number of regular outliers (i.e. the raster nodes) and indicating which datasets have been flagged. Additionally, the diagnostic plots clearly show the rasterized pattern in the biased dataset and confirm the automated flag.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.