CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Documented in cd_round

#' Identify Datasets with Rasterized Coordinates
#' 
#' Flags datasets with periodicity patterns indicative of a rasterized
#' (lattice) collection scheme, as often obtain from e.g. atlas data. Using a
#' combination of autocorrelation and sliding-window outlier detection to
#' identify periodicity patterns in the data. See 
#' \url{https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.13152}
#' for further details and 
#' a description of the algorithm
#' 
#' 
#' @param ds a character string. The column with the dataset of each record. In
#' case \code{x} should be treated as a single dataset, identical for all
#' records.  Default = \dQuote{dataset}.
#' @param T1 numeric.  The threshold for outlier detection in a in an
#' interquantile range based test. This is the major parameter to specify the
#' sensitivity of the test: lower values, equal higher detection rate. Values
#' between 7-11 are recommended. Default = 7.
#' @param reg_out_thresh numeric. Threshold on the number of equal distances
#' between outlier points.  See details.  Default = 2.
#' @param reg_dist_min numeric.  The minimum detection distance between
#' outliers in degrees (the minimum resolution of grids that will be flagged).
#' Default = 0.1.
#' @param reg_dist_max numeric.  The maximum detection distance between
#' outliers in degrees (the maximum resolution of grids that will be flagged).
#' Default = 2.
#' @param min_unique_ds_size numeric.  The minimum number of unique locations
#' (values in the tested column) for datasets to be included in the test.
#' Default = 4.
#' @param graphs logical. If TRUE, diagnostic plots are produced.  Default =
#' TRUE.
#' @param test character string.  Indicates which column to test. Either
#' \dQuote{lat} for latitude, \dQuote{lon} for longitude, or \dQuote{both} for
#' both.  In the latter case datasets are only flagged if both test are failed.
#' Default = \dQuote{both}
#' @inheritParams cc_cap
#' 
#' @return Depending on the \sQuote{value} argument, either a \code{data.frame}
#' with summary statistics and flags for each dataset (\dQuote{dataset}) or a
#' \code{data.frame} containing the records considered correct by the test
#' (\dQuote{clean}) or a logical vector (\dQuote{flagged}), with TRUE = test passed and FALSE =
#' test failed/potentially problematic. Default =
#' \dQuote{clean}.
#' 
#' @note See \url{https://ropensci.github.io/CoordinateCleaner/} for more
#' details and tutorials.
#' 
#' @keywords "Coordinate cleaning" "Dataset level cleaning"
#' @family Datasets
#' 
#' @examples
#' 
#' #simulate bias grid, one degree resolution, 10% error on a 1000 records dataset
#' #simulate biased fraction of the data, grid resolution = 1 degree
#' #simulate non-biased fraction of the data
#'   bi <- sample(3 + 0:5, size = 100, replace = TRUE)
#'   mu <- runif(3, 0, 15)
#'   sig <- runif(3, 0.1, 5)
#'   cl <- rnorm(n = 900, mean = mu, sd = sig)
#'   lon <- c(cl, bi)
#'   
#'   bi <- sample(9:13, size = 100, replace = TRUE)
#'   mu <- runif(3, 0, 15)
#'   sig <- runif(3, 0.1, 5)
#'   cl <- rnorm(n = 900, mean = mu, sd = sig)
#'   lat <- c(cl, bi)
#'   
#'   #add biased data
#'   
#'   inp <- data.frame(decimalLongitude = lon,
#'                     decimalLatitude = lat,
#'                     dataset = "test")
#'             
#'           
#'   #run test
#'   \dontrun{
#'   cd_round(inp, value = "dataset")
#'   }
#'   
#' 
#' @export
#' @importFrom stats complete.cases
#' @importFrom graphics title
cd_round <- function(x, 
                     lon = "decimalLongitude", 
                     lat = "decimalLatitude", 
                     ds = "dataset",
                     T1 = 7, 
                     reg_out_thresh = 2, 
                     reg_dist_min = 0.1, 
                     reg_dist_max = 2, 
                     min_unique_ds_size = 4,
                     graphs = TRUE, 
                     test = "both", 
                     value = "clean",
                     verbose = TRUE) {

  # fix operators which are tuned and people most likely do not want to change
  window_size <- 10
  detection_rounding <- 2
  detection_threshold <- 6
  digit_round <- 0
  nc <- 3000
  rarefy <- FALSE

  match.arg(value, choices = c("flagged", "clean", "dataset"))

  if (verbose) {
    message("Testing for rasterized collection")
  }
  
  if (length(unique(x[[ds]])) > 1) {
    dat <- split(x, f = x[[ds]])

    out <- lapply(dat, function(k) {
      tester <- k[complete.cases(k[, c(lon, lat)]), ]
      if (nrow(tester[!duplicated(tester[, c(lon, lat)]), ]) < 
          min_unique_ds_size) {
        warning("Dataset smaller than minimum test size")
        if (test == "lon") {
          out <- data.frame(
            dataset = unique(k[[ds]]), lon.n.outliers = NA,
            lon.n.regular.distance = NA, lon.regular.distance = NA, summary = NA
          )
        }
          if (test == "lat") {
            out <- data.frame(
              dataset = unique(k[[ds]]), lat.n.outliers = NA,
              lat.n.regular.distance = NA, lat.regular.distance = NA, summary = NA
            )
          }
          if (test == "both") {
            out <- data.frame(
              dataset = unique(k[[ds]]),
              lon.n.outliers = NA,
              lon.n.regular.outliers = NA,
              lon.regular.distance = NA,
              lon.flag = NA,
              lat.n.outliers = NA,
              lat.n.regular.outliers = NA,
              lat.regular.distance = NA,
              lat.flag = NA, 
              summary = NA
            )
          }      } else {
        if (test == "lon") {
          # calculate autocorrelation
          gvec <- .CalcACT(
            data = k[[lon]], digit_round = digit_round,
            nc = nc, graphs = graphs, graph_title = unique(k[[ds]])
          )
          # run the sliding window outlier detection
          n_outl <- .OutDetect(gvec,
            T1 = T1, window_size = window_size,
            detection_rounding = detection_rounding, 
            detection_threshold = detection_threshold,
            graphs = graphs
          )

          n_outl$flag <- !all(n_outl$n.outliers > 0, 
                              n_outl$regular.distance >= reg_dist_min, 
                              n_outl$regular.distance <= reg_dist_max, 
                              n_outl$n.regular.outliers >= reg_out_thresh)

          if (graphs) {
            title(paste(unique(k[[ds]]), n_outl$flag, sep = " - "))
          }
          n_outl <- data.frame(unique(k[[ds]]), n_outl)
          names(n_outl) <- c(
            "dataset", "lon.n.outliers", "lon.n.regular.distance",
            "lon.regular.distance", "summary"
          )
        }

        if (test == "lat") {
          gvec <- .CalcACT(
            data = k[[lat]], 
            digit_round = digit_round,
            nc = nc, graphs = graphs, 
            graph_title = unique(k[[ds]])
          )
          # run the sliding window outlier detection
          n_outl <- .OutDetect(gvec,
            T1 = T1, window_size = window_size,
            detection_rounding = detection_rounding, 
            detection_threshold = detection_threshold,
            graphs = graphs
          )

          n_outl$flag <- !all(n_outl$n.outliers > 0, 
                              n_outl$regular.distance >= reg_dist_min, 
                              n_outl$regular.distance <= reg_dist_max, 
                              n_outl$n.regular.outliers >= reg_out_thresh)

          if (graphs) {
            title(paste(unique(k[[ds]]), n_outl$flag, sep = " - "))
          }
          
          n_outl <- data.frame(unique(k[[ds]]), n_outl)
          names(n_outl) <- c(
            "dataset", "lat.n.outliers", "lat.n.regular.distance",
            "lat.regular.distance", "summary"
          )
        }

        if (test == "both") {
          gvec1 <- .CalcACT(
            data = k[[lon]], digit_round = digit_round,
            nc = nc, graphs = graphs, graph_title = unique(k[[ds]])
          )
          n_outl_lon <- .OutDetect(gvec1,
            T1 = T1, window_size = window_size,
            detection_rounding = detection_rounding, 
            detection_threshold = detection_threshold,
            graphs = graphs
          )

          n_outl_lon$flag <- !all(
            n_outl_lon$n.outliers > 0, n_outl_lon$regular.distance >=
              reg_dist_min, n_outl_lon$regular.distance <= reg_dist_max,
            n_outl_lon$n.regular.outliers >= reg_out_thresh
          )

          if (graphs) {
            title(paste(unique(k[[ds]]), n_outl_lon$flag, sep = " - "))
          }

          gvec2 <- .CalcACT(
            data = k[[lat]], digit_round = digit_round,
            nc = nc, graphs = graphs, graph_title = unique(k[[ds]])
          )
          n_outl_lat <- .OutDetect(gvec2,
            T1 = T1, window_size = window_size,
            detection_rounding = detection_rounding, 
            detection_threshold = detection_threshold,
            graphs = graphs
          )

          n_outl_lat$flag <- !all(
            n_outl_lat$n.outliers > 0, n_outl_lat$regular.distance >=
              reg_dist_min, n_outl_lat$regular.distance <= reg_dist_max,
            n_outl_lat$n.regular.outliers >= reg_out_thresh
          )

          if (graphs) {
            title(paste(unique(k[[ds]]), n_outl_lat$flag, sep = " - "))
          }

          n_outl <- cbind(unique(k[[ds]]), n_outl_lon, n_outl_lat)
          names(n_outl) <- c(
            "dataset", 
            "lon.n.outliers", 
            "lon.n.regular.outliers",
            "lon.regular.distance", 
            "lon.flag", 
            "lat.n.outliers", 
            "lat.n.regular.outliers",
            "lat.regular.distance", "lat.flag"
          )

          n_outl$summary <- n_outl$lon.flag | 
            n_outl$lat.flag # only flag if both are flagged
        }

        return(n_outl)
      }
    })

    out <- do.call("rbind.data.frame", out)
  } else {
    if (nrow(x[!duplicated(x[, c(lon, lat)]), ]) < min_unique_ds_size) {
      warning("Dataset smaller than minimum test size")
      out <- data.frame(
        dataset = unique(x[[ds]]), 
        n.outliers = NA, 
        n.regular.outliers = NA,
        regular.distance = NA, 
        summary = NA
      )
    } else {
      if (test == "lon") {
        # calculate autocorrelation
        gvec <- .CalcACT(
          data = x[[lon]], 
          digit_round = digit_round,
          nc = nc, 
          graphs = graphs, 
          graph_title = unique(x[[ds]])
        )
        # run the sliding window outlier detection
        n_outl <- .OutDetect(gvec,
          T1 = T1, window_size = window_size,
          detection_rounding = detection_rounding, 
          detection_threshold = detection_threshold,
          graphs = graphs
        )

        n_outl$flag <- !all(n_outl$n.outliers > 0, 
                            n_outl$regular.distance >= reg_dist_min, 
                            n_outl$regular.distance <= reg_dist_max, 
                            n_outl$n.regular.outliers >= reg_out_thresh)

        if (graphs) {
          title(paste(unique(x[[ds]]), n_outl$flag, sep = " - "))
        }

        n_outl <- data.frame(unique(x[[ds]]), n_outl)
        names(n_outl) <- c(
          "dataset", "lon.n.outliers", "lon.n.regular.distance",
          "lon.regular.distance", "summary"
        )
      }

      if (test == "lat") {
        gvec <- .CalcACT(
          data = x[[lat]], digit_round = digit_round,
          nc = nc, graphs = graphs, graph_title = unique(x[[ds]])
        )
        # run the sliding window outlier detection
        n_outl <- .OutDetect(gvec,
          T1 = T1, window_size = window_size,
          detection_rounding = detection_rounding, 
          detection_threshold = detection_threshold,
          graphs = graphs
        )

        n_outl$flag <- !all(n_outl$n.outliers > 0, 
                            n_outl$regular.distance >= reg_dist_min, 
                            n_outl$regular.distance <= reg_dist_max, 
          n_outl$n.regular.outliers >= reg_out_thresh)

        if (graphs) {
          title(paste(unique(x[[ds]]), n_outl$flag, sep = " - "))
        }

        n_outl <- data.frame(unique(x[[ds]]), n_outl)
        names(n_outl) <- c(
          "dataset", "lat.n.outliers", "lat.n.regular.distance",
          "lat.regular.distance", "summary"
        )
      }

      if (test == "both") {
        gvec1 <- .CalcACT(
          data = x[[lon]], digit_round = digit_round,
          nc = nc, graphs = graphs, graph_title = unique(x[[ds]])
        )

        n_outl_lon <- .OutDetect(gvec1,
          T1 = T1, window_size = window_size,
          detection_rounding = detection_rounding, 
          detection_threshold = detection_threshold,
          graphs = graphs
        )

        n_outl_lon$flag <- !all(
          n_outl_lon$n.outliers > 0, n_outl_lon$regular.distance >=
            reg_dist_min, n_outl_lon$regular.distance <= reg_dist_max,
          n_outl_lon$n.regular.outliers >= reg_out_thresh
        )

        if (graphs) {
          title(paste(unique(x[[ds]]), n_outl_lon$flag, sep = " - "))
        }


        gvec2 <- .CalcACT(
          data = x[[lat]], digit_round = digit_round,
          nc = nc, graphs = graphs, graph_title = unique(x[[ds]])
        )

        n_outl_lat <- .OutDetect(gvec2,
          T1 = T1, window_size = window_size,
          detection_rounding = detection_rounding, 
          detection_threshold = detection_threshold,
          graphs = graphs
        )


        n_outl_lat$flag <- !all(
          n_outl_lat$n.outliers > 0, n_outl_lat$regular.distance >=
            reg_dist_min, n_outl_lat$regular.distance <= reg_dist_max,
          n_outl_lat$n.regular.outliers >= reg_out_thresh
        )

        if (graphs) {
          title(paste(unique(x[[ds]]), n_outl_lat$flag, sep = " - "))
        }

        n_outl <- data.frame(unique(x[[ds]]), n_outl_lon, n_outl_lat)
        names(n_outl) <- c(
          "dataset", "lon.n.outliers", 
          "lon.n.regular.distance",
          "lon.regular.distance", 
          "lon.flag", 
          "lat.n.outliers", 
          "lat.n.regular.distance",
          "lat.regular.distance", 
          "lat.flag"
        )

        n_outl$summary <- n_outl$lon.flag | 
          n_outl$lat.flag # only flag if both are flagged
      }
      out <- n_outl
    }
  }

  switch(value, 
         dataset = return(out), 
         clean = return({test <- x[x[[ds]] %in% out[out$summary, "dataset"], ]
    if (length(test) > 0) {
      test
    } else {
      NULL
    }
  }), 
  flagged = return(x[[ds]] %in% out[out$summary, "dataset"]))
}
azizka/CoordinateCleaner documentation built on March 10, 2024, 8:32 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
azizka/CoordinateCleaner
Automated Cleaning of Occurrence Records from Biological Collections

R/cd_round.R
In azizka/CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Defines functions cd_round

Documented in cd_round

R Package Documentation

Browse R Packages

We want your feedback!

azizka/CoordinateCleaner Automated Cleaning of Occurrence Records from Biological Collections

R/cd_round.R In azizka/CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Defines functions cd_round

Documented in cd_round

R Package Documentation

Browse R Packages

We want your feedback!

azizka/CoordinateCleaner
Automated Cleaning of Occurrence Records from Biological Collections

R/cd_round.R
In azizka/CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections