R/internal_clean_dataset.R
In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

# 1. The autocorrelation function returning a gamma vector, paramters =
# nbins and roudning, the latter = 0 for now
#' @importFrom graphics hist plot abline segments
#' @importFrom stats cov quantile aggregate
.CalcACT <- function(data, 
                     digit_round = 0, 
                     nc = 3000, 
                     graphs = TRUE, 
                     graph_title = "Title",
                     rarefy = FALSE) {
  if (rarefy) {
    data <- unique(data)
  }

  data_units <- sort(abs(data))

  if (digit_round > 0) {
    # if set to 10 takes only units into account
    data_units <- data_units - 
      round(floor(data_units / digit_round) * digit_round)
  }

  if (graphs) {
    h <- graphics::hist(data_units, nclass = nc, main = graph_title)
  } else {
    h <- graphics::hist(data_units, nclass = nc, plot = FALSE)
  }

  f <- h$counts
  max_range <- round(length(f) * 0.9)
  gamma_0 <- stats::cov(f[1:max_range], f[1:max_range])

  gamma_vec <- c()

  for (k in 1:max_range) {
    f_0 <- f[-(1:k)]
    f_k <- f[-((length(f) - k + 1):(length(f)))]
    gamma_vec <- c(gamma_vec, stats::cov(f_0, f_k) / gamma_0)
  }

  # add coordinates
  coords <- h$mids[1:max_range]

  # plot outlier bins vs coordinates
  if (graphs) {
    plot(coords, gamma_vec)
  }

  # create output data.frame, with the gamma vector and the respective
  # coordinates

  out <- data.frame(gamma = gamma_vec, coords = coords)

  return(out)
}

# 2.function to run a sliding window over the gamma vector, identifying
# outliers, using the interquantile range. Two parameters: window size
# (fixed at 10 points for now) and the outlier threshold (T1, this is the
# most important paramter for the function). The function returns the number
# of non-consecutive 1 (= outlier found)
.OutDetect <- function(x, T1 = 7, 
                       window_size = 10, 
                       detection_rounding = 2,
                       detection_threshold = 6, 
                       graphs = TRUE) {
  
  # The maximum range end for the sliding window
  max_range <- nrow(x) - window_size 

  out <- matrix(ncol = 2)

  for (k in 1:max_range) {
    # sliding window
    sub <- x[k:(k + window_size), ] # sliding window

    # interquantile range outlier detection
    quo <- stats::quantile(sub$gamma, c(0.25, 0.75), na.rm = TRUE)
    outl <- matrix(nrow = nrow(sub), ncol = 2)
    outl[, 1] <- sub$gamma > (quo[2] + stats::IQR(sub$gamma) * T1) 
    outl[, 2] <- sub$coord
    out <- rbind(out, outl)
  }

  out <- stats::aggregate(out[, 1] ~ out[, 2], FUN = "sum")
  names(out) <- c("V1", "V2")
  out <- out[, c(2, 1)]
  # only 'outliers' that have at least been found by at least 6 sliding windows
  out[, 1] <- as.numeric(out[, 1] >= detection_threshold) 

  # A distance matrix between the outliers we use euclidean space, because 1)
  # we are interest in regularity, not so much the absolute distance, 2) grids
  # might often be in lat/lon and 3) most grids wil only spann small spatial
  # extent
  outl <- out[out[, 1] == 1, ]

  if (nrow(outl) == 0) {
    out <- data.frame(n.outliers = 0, 
                      n.regular.outliers = 0, 
                      regular.distance = NA)
  } else if (nrow(outl) == 1) {
    if (graphs) {
      graphics::abline(v = outl[, 2], col = "green")
    }

    out <- data.frame(n.outliers = 1, 
                      n.regular.outliers = 0, 
                      regular.distance = NA)
  } else {
    # add the identified outliers to the plot
    if (graphs) {
      graphics::abline(v = outl[, 2], col = "green")
    }

    # calculate the distance between outliers
    dist_m <- round(stats::dist(round(outl[, 2, drop = FALSE], 
                                      detection_rounding),
                                diag = FALSE), 
                    detection_rounding)
    dist_m[dist_m > 2] <- NA

    if (length(dist_m) == sum(is.na(dist_m))) {
      out <- data.frame(
        n.outliers = nrow(outl), n.regular.outliers = 0,
        regular.distance = NA
      )
    } else {
      # process distance matrix
      dist_m[dist_m < 10^(-detection_rounding)] <- NA
      dists <- c(dist_m)
      dists <- sort(table(dists))
      dist_m <- as.matrix(dist_m)
      dist_m[row(dist_m) <= col(dist_m)] <- NA

      # select those with the most common distance
      # find the most common distance between points
      com_dist <- as.numeric(names(which(dists == max(dists)))) 

      # if there is more than one probably no bias
      sel <- which(dist_m %in% com_dist[1])
      # identify rows with at least one time the most common distance
      sel <- unique(arrayInd(sel, dim(dist_m))) 

      # sel <- unique(as.numeric(colnames(dist_m)[sel[,1]]))

      reg_outl <- cbind(outl[sel[, 1], 2], outl[sel[, 2], 2])

      if (graphs) {
        # find the right y to plot the segments
        y0 <- max(x$gamma)

        if (y0 < 0.3 | y0 > 2) {
          y1 <- max(x$gamma) - max(x$gamma) / nrow(reg_outl)
        } else {
          y1 <- max(x$gamma) - 0.1
        }
        ys <- seq(y0, y1, by = -((y0 - y1) / (nrow(reg_outl) - 1)))

        segments(
          x0 = reg_outl[, 1], x1 = reg_outl[, 2], y0 = ys, y1 = ys,
          col = "red"
        )
      }
      # output: number of outliers, number of outliers with the most common
      # distance, the most common distance
      out <- data.frame(
        n.outliers = nrow(outl), n.regular.outliers = nrow(reg_outl),
        regular.distance = com_dist[1]
      )
    }
  }
  return(out)
}

Any scripts or data that you put into this service are public.

CoordinateCleaner documentation built on Oct. 25, 2023, 1:08 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CoordinateCleaner
Automated Cleaning of Occurrence Records from Biological Collections

R/internal_clean_dataset.R
In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Defines functions .OutDetect .CalcACT

Try the CoordinateCleaner package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CoordinateCleaner Automated Cleaning of Occurrence Records from Biological Collections

R/internal_clean_dataset.R In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections

Defines functions .OutDetect .CalcACT

Try the CoordinateCleaner package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CoordinateCleaner
Automated Cleaning of Occurrence Records from Biological Collections

R/internal_clean_dataset.R
In CoordinateCleaner: Automated Cleaning of Occurrence Records from Biological Collections