R/similaritytests.R
In specleanr: Detecting Environmental Outliers in Data Analysis Pipelines

Documented in cosine hamming jaccard multiabsolute ocindex overlap smc sorensen

#identify absolute outliers, proportions and best methods.

#' @noRd
oci <- function(absoluteoutliers, absolute_propn, threshold, listofmethods,
                absolute, props, autothreshold){

  #Repeat step 2 but with only absolute outliers selected using a user-based threshold

  #the method obtained here will act as the standard since it considers all methods in a

  #multidimensional aspect

  absoluteinmethod_matrix <- matrix(NA, nrow = length(absoluteoutliers), ncol = length(listofmethods))

  for (cvii in seq_along(listofmethods)) {

    lsts2 <- listofmethods[[cvii]]

    for (cviii in seq_along(absoluteoutliers)) {

      absoluteout <- absoluteoutliers[cviii]

      absoluteinmethod_matrix[cviii, cvii] <- ifelse(absoluteout%in%lsts2, 1, 0)

    }
  }
  #Obtain absolute outlier for each method

  maximumoutlier_method <- c() #maximum outlier

  for (cvix in 1:ncol(absoluteinmethod_matrix)) maximumoutlier_method[cvix] <- sum(absoluteinmethod_matrix[, cvix])

  #total outliers for each method

  uniquetotalemethod <- c()

  for (cvx in seq_along(listofmethods))  uniquetotalemethod[cvx] <- length(unique(listofmethods[[cvx]]))


  proportion_absoluteness <- c() #proportion of absolute in each method: maximum/total

  for (cvxi in seq_along(maximumoutlier_method)) {

    maxout <- maximumoutlier_method[cvxi]

    totalout <- uniquetotalemethod[cvxi]

    proportion_absoluteness[cvxi] <- maxout/totalout

  }
  #Select best method with maximum absolute outliers
  indx <- which(maximumoutlier_method==max(maximumoutlier_method) & proportion_absoluteness == max(proportion_absoluteness))

  if(length(indx)>1){

    indx <- indx[1] #choose any

  }else if(length(indx)==0){
    indx <- which(proportion_absoluteness == max(proportion_absoluteness))

    if(length(indx)>1) indx <- indx[1]

  }else{
    'Silence'
  }

  getmethodnames <- names(listofmethods)

  bstmethod <- getmethodnames[indx]

  if(absolute==TRUE){

    if(props==FALSE){

      if(autothreshold ==TRUE) return(list(threshold, absoluteoutliers)) else return(absoluteoutliers)

    }else if(props==TRUE){

      propdata <- cbind(absoluteoutliers, absolute_propn)

      return(as.data.frame(propdata))

    }else{
      stop('Choose either TRUE or FASLE to return abolute outlier or the percentage abolutness')
    }

  }else if(absolute==FALSE){

    return(bstmethod)

  }else{
    stop('Choose either TRUE or FALSE for absolute parameter to return outliers or bestmethod. Defualt is FALSE')
  }
}

#' @title Identifies absolute outliers and their proportions for a single species.
#'
#' @param x \code{datacleaner} class for each methods used to identify outliers
#'   in \code{multidetect} function.
#' @param sp \code{string}. Species name or index if multiple species are
#'   considered during outlier detection.
#' @param threshold \code{numeric}. Maximum value to denote an absolute outlier.
#'   The threshold ranges from \code{0}, which indicates a point has not been
#'   flagged by any outlier detection method as an \code{outlier}, to \code{1},
#'   which means the record is an absolute or true outlier since all methods
#'   have identified it. At both extremes, many records are classified at low
#'   threshold values, which may be due to individual method weakness or
#'   strength and data distribution. Also, at higher threshold values, the true
#'   outliers are retained. For example, if ten methods are considered and 9
#'   methods flag a record as an outlier, If a cutoff of 1 is used, then that
#'   particular record is retained. Therefore, the \code{default} cutoff is
#'   \code{0.6}, but \code{autothreshold} can be used to select the appropriate
#'   threshold.
#' @param autothreshold \code{vector}. Identifies the threshold with mean number
#'   of absolute outliers.The search is limited within 0.51 to 1 since
#'   thresholds less than are deemed inappropriate for identifying absolute
#'   outliers. The autothreshold is used when \code{threshold} is set to
#'   \code{NULL}.
#' @param absolute \code{logical}. To output absolute outliers for a species.
#' @param props \code{dataframe}. To output the proportional absoluteness for
#'   each outlier.
#' @param warn \code{logical}. If \strong{TRUE}, warning on whether absolute
#'   outliers obtained at a low threshold is indicated. Default \strong{TRUE}.
#'
#'
#' @return \code{vector} or \code{dataframe} of absolute outliers, best outlier detection method or data frame of absolute outliers and their
#' proportions
#'
#' @export
#'
#' @examples
#'
#' \donttest{
#'
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                       lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#'
#'outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                           exclude = c('x','y'), multiple = TRUE,
#'                           methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' ociss <- ocindex(x = outliersdf, sp= 1, threshold = 0.2, absolute = TRUE)#
#' #No outliers detected in more than two methods
#'
#'}
#'
ocindex <- function(x, sp = NULL, threshold = NULL, absolute=FALSE, props=FALSE, warn = FALSE, autothreshold=FALSE){

  if(!is(x, 'datacleaner')) stop('Datacleaner class accepted')

  if(x@out!='outlier') stop('Only extracts outliers yet clean data has been produced.')

  if(is.null(threshold) && autothreshold == FALSE) stop("Either set the threshold value or change autothreshold parameter to TRUE.")

  if(!is.null(threshold) && isTRUE(autothreshold)) stop("Either set threshold value (from 0 to 1) or autothreshold to TRUE but not both.")

  if(!is.null(threshold)) if(threshold>1 | threshold<0) stop('threshold must range from 0 to 1.')

  #Get the variable used in outlier detection

  var <- x@varused

  if(length(var)>1) {

  if(is.numeric(sp)) {

    if(sp > length(var))stop("The index number provided for sp is out of bounds, the index number sholud not exceed ", length(x@result), ".", call. = FALSE)

    var <- var[sp]

  }else{
    if(length(sp)>1 || !is.atomic(sp) || !is.character(sp))stop("Indicate only one variable and must character string selected from ", paste(var, collapse = ','))

    if(isTRUE(sp%in%var)) var <- sp else stop("The ", sp, " is not among the variables used during outlier detection, select from ", paste(var, collapse = ','))
  }
}else {
    var
  }

  #For multiple species

  if(isTRUE(x@mode) == TRUE && !is.null(sp)){

    species <- x@result[[sp]]

  }else if(isTRUE(x@mode) == FALSE && is.null(sp)){

    species <- x@result

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }

  #first check for null values for methods that were not successful

  checknull <- sapply(species, nrow)

  #remove methods that didn't execute

  speciesNULL <- species[!sapply(checknull,is.null)]

  #check if any method returned no outliers but will be retained while computing absolute outliers.
  len <- sapply(speciesNULL, nrow)

  if(any(len==0)) y <- speciesNULL[len !=0] else y <-  speciesNULL #replace the list with empty data with y

  if(length(y)<2) stop('The methods with outliers are less than 2.')

  lstvec <- list()

  for (ciii in names(y)) {

    lstvec[[ciii]] <- sort(y[[ciii]][[var]])
  }

  #create a vector of all outliers from all methods

  all_outliers <- unique(do.call(c, lstvec))

  absoluteoutlier_matrix <- matrix(NA, nrow = length(all_outliers), ncol = length(lstvec))

  for (civ in seq_along(lstvec)) {

    lsts <- lstvec[[civ]]

    for (cv in seq_along(all_outliers)) {

      numbs <- all_outliers[cv]

      absoluteoutlier_matrix[cv, civ] <- ifelse(numbs%in%lsts, 1, 0)
    }
  }
  #compute the outlier percentage presence across the methods
  outlier_propn <- c()

  outlier_value <- c()
  #loop through the matrix with 1 for presence and 0 for absent in a particular method

  for (cvi in 1:nrow(absoluteoutlier_matrix)) {

    #instead of using only the methods with outliers, outliers are weighted across all methods

    #therefore the denominator is length(x) where x is the count of methods considered

    outlier_propn[cvi] <- sum(absoluteoutlier_matrix[cvi,])/length(speciesNULL)

    outlier_value[cvi] <- all_outliers[cvi] # identifies the real outlier value
  }


  if(autothreshold == TRUE){

    #search from >= 0.5 to 1, which threshold had the highest mean absolute outliers.

    thresholdval <- c(0.51, 0.6, 0.7, 0.8, 0.9, 1)

    numbofabs <- sapply(X = thresholdval, function(x) length(which(outlier_propn>= x)))

    #assuming the indices are more than 2

    checkZeros <- sapply(numbofabs, function(x) ifelse(x==0, 0, 1)) #If the value is 0 == YES  else No

    indzero <- which(checkZeros==0)

    if(all(checkZeros==0)==TRUE){

      stop("No absolute outliers for thresholds ranging from 0.51 to 1", call. = FALSE)

    }else if(length(indzero)>=4) { #no need to use mean

      bestthreshold <- thresholdval[1] #select the 0.51 because probably returned absolute outliers/lowest cutoff

    }else{
      diffmean <- sapply(numbofabs, function(x) abs(x-mean(numbofabs)))

      indx <- which(diffmean== min(diffmean))

      if(length(indx)>1) indx2 <- indx[1] else indx2 <- indx

      bestthreshold = thresholdval[indx2]
    }
    ido <- which(outlier_propn>= bestthreshold) #filter out only indices of absolute outliers at a best threshold

    absolute_outliers <- outlier_value[ido] #absolute outlier real values

    absolute_propn <- outlier_propn[ido]

    out <- oci(absoluteoutliers = absolute_outliers, threshold = bestthreshold, absolute_propn = absolute_propn,
               listofmethods = lstvec, absolute = absolute, props= props, autothreshold = autothreshold)
  }else{

    ido <- which(outlier_propn>= threshold) #filter out only indices of absolute outliers at a certain threshold

    absolute_outliers <- outlier_value[ido] #absolute outlier real values

    absolute_propn <- outlier_propn[ido]

    if(length(absolute_outliers)>=1 && threshold>=0.5){

      out <- oci(absoluteoutliers = absolute_outliers, threshold = threshold, absolute_propn = absolute_propn,
                 listofmethods = lstvec, absolute = absolute, props= props, autothreshold = autothreshold)

    }else if(length(absolute_outliers)>=1 && threshold<0.5){

      if(isTRUE(warn))warning('The absolute outliers found are suspicious since they are shared by less than 50% of the methods used.')

      out <- oci(absoluteoutliers = absolute_outliers, threshold = threshold, absolute_propn = absolute_propn,
                 listofmethods = lstvec, absolute = absolute, props= props, autothreshold = autothreshold)

    }else{
      stop('No absolute outliers found with a threshold of ', threshold, '. Reduce and try again or continue with the reference dataset.')
    }
  }
}


#' @title  Identifies absolute outliers for multiple species.
#'
#' @inheritParams ocindex
#'
#' @return vector or absolute outliers, best outlier detection method or data frame of absolute outliers and their
#' proportions
#'
#' @export
#'
#' @examples
#'
#' \donttest{
#'
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                       lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#'outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                           exclude = c('x','y'), multiple = TRUE,
#'                           methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' totabs_counts <- multiabsolute(x = outliersdf, threshold = 0.2)
#'
#' }
#'
#' @seealso \code{\link{ocindex}}
#'

multiabsolute <- function(x, threshold = NULL, props = FALSE, warn = FALSE, autothreshold = FALSE){

  if(!is(x, 'datacleaner')) stop('Only datacleaner class accepted')

  if(x@out!='outlier') stop('Only extracts outliers yet clean data has been produced.')

  if(is.null(threshold) && autothreshold == FALSE) stop("Either set the threshold value or change `autothreshold` parameter to TRUE.")

  if(!is.null(threshold) && isTRUE(autothreshold)) stop("Either set threshold value (from 0 to 1) or autothreshold to TRUE but not both.")

  if(!is.null(threshold)) if(threshold>1 | threshold<0) stop('threshold must range from 0 to 1.')

  if(x@mode==FALSE)stop("Instead use ocindex when single dataset or parameters considered.")

  absoluteoutliers <- c() #Absolute outliers count

  species <- c()

  #to extract the number of absolute outlier flagged by each method at a particular threshold

  if(isFALSE(props)){

    for (di in 1:length(x@result)) {

      absx <- tryCatch(
        expr = {
          if(autothreshold == TRUE){
            abscount <- ocindex(x= x, sp = di, absolute = TRUE, threshold = threshold, props = FALSE, warn=warn, autothreshold = TRUE)
          }else{

            abscount <- ocindex(x= x, sp = di, absolute = TRUE, threshold = threshold, props = FALSE, warn=warn, autothreshold = FALSE )

          }
        },
        error= function(e){

          if(isTRUE(warn))warning('No absolute outliers exist for species ', names(x@result)[di], '.')

          return(NULL)
        })

      if(length(absx) >0){

        absoluteoutliers[di] <- if(autothreshold ==TRUE) length(abscount[[2]]) else length(abscount)

      }else{
        absoluteoutliers[di] <- 0
      }
      species[di]<- names(x@result)[di]
    }

    dx <- data.frame(groups = species, absoutliers = absoluteoutliers)

    #to extract a dataframe of absolute outlier proportions for all the methods at a particular threshold
  }else if(isTRUE(props)){

    splist <- list()

    for (di in 1:length(x@result)) {

      absx <- tryCatch(
        expr = {

          absprops <- ocindex(x= x, sp = di, absolute = TRUE, threshold = threshold,
                              props = TRUE, warn=warn, autothreshold = autothreshold)

        },
        error= function(e){
          if(isTRUE(warn))warning('No absolute outliers exist for species ', names(x@result)[di], '.', call. = FALSE)
          return(NULL)
        })
      if(length(absx)>0 ){
        splist[[di]] <- absprops
        splist[[di]]$threshold <- threshold
        splist[[di]]$groups <- names(x@result)[di]
        splist[[di]]$numabs <- nrow(absprops)
      }else{
        splist[[di]] <- NULL
      }
    }
    lext <- sapply(splist, length)

    if(any(lext==0)) extfinal <- splist[lext !=0] else extfinal<- splist

    dx <- do.call(rbind, extfinal)

  }else{
    stop('Choose either FALSE or TRUE for props to extract proportion or absolute outliers')
  }
  return(dx)
}


# Presence absence dissimilarity methods

#' @title Identifies the best outlier detection method using Jaccard coefficient.
#'
#' @inheritParams ocindex
#'
#' @return \code{string} best method for identifying outliers.
#'
#' @export
#'
#' @examples
#'
#'\donttest{
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                     lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#'
#' outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                          exclude = c('x','y'), multiple = TRUE,
#'                          methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' jaccardout <- jaccard(x = outliersdf, sp= 1, threshold = 0.2)#
#'
#'}
#'
jaccard <- function(x, sp = NULL, threshold = NULL, warn=FALSE, autothreshold=FALSE){


  #check if there are absolute outliers
  if(autothreshold==TRUE){

    absoutliers_list <- ocindex(x= x, sp = sp,  absolute = TRUE, threshold = threshold, warn = warn,
                                autothreshold = autothreshold)
    absoutliers <- absoutliers_list[[2]]

  }else{
    absoutliers <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn)
  }

  var <- x@varused
  if(length(var)>1) var <- sp else var

  if(x@mode == TRUE && !is.null(sp)){

    if(length(absoutliers)>=1)  species <- x@result[[sp]] else stop('No outliers')

  }else if(x@mode == FALSE && is.null(sp)){

    if(length(absoutliers)>=1) species <- x@result else stop('No outliers')

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }

  checkNA <- sapply(species, nrow)

  #remove methods that didnot execute
  speciesNULL <- species[!sapply(checkNA,is.null)]

  len <- sapply(speciesNULL, nrow) #check if any method returned no outliers

  if(any(len==0)) y <- speciesNULL[len !=0] else y <- speciesNULL #replace the list with empty data with y

  matj <- matrix(NA, nrow = length(y), ncol = length(y))

  for (cxii in seq_along(y)) {
    m1 <- y[[cxii]][[var]]

    for (cxiii in seq_along(y)) {

      m2 <- y[[cxiii]][[var]]

      intx <- intersect(m1, m2)

      unx <- union(m1, m2)

      matj[cxii, cxiii] <- (length(intx)/length(unx))*100

    }
  }
  #choose best method with highest mean Jaccard index value
  jdx <- c()

  for (bi in 1:nrow(matj)) {

    jadvalues <- matj[,bi]

    jdx[bi] <- mean(jadvalues)
  }
  indxj <- which(jdx==max(jdx))

  if(length(indxj)>1) indxj <- indxj[1]

  bestmethodj <- names(y)[indxj]

  return(bestmethodj)
}

#
#  @title Overlap coefficient (Szymkiewicz–Simpson coefficient)
#
#' Identifies best outlier detection method using Overlap coefficient.
#'
#' @inheritParams ocindex
#'
#' @return best method for identifying outliers.
#'
#' @export
#'
#' @examples
#'
#'\donttest{
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                     lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#'
#' outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                          exclude = c('x','y'), multiple = TRUE,
#'                          methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' overlapout <- overlap(x = outliersdf, sp= 1, threshold = 0.2)#
#'}
#'
overlap <- function(x, sp = NULL, threshold = NULL, warn=FALSE, autothreshold = FALSE){


  if(isTRUE(autothreshold)){
    absoutliers_list <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn,
                                autothreshold = autothreshold)
    absoutliers <- absoutliers_list[[2]]
  }else{
    absoutliers <- ocindex(x= x, sp = sp,  absolute = TRUE, threshold = threshold, warn = warn)
  }

  var <- x@varused
  if(length(var)>1) var <- sp else var

  if(x@mode == TRUE && !is.null(sp)){

    if(length(absoutliers)>=1)  species <- x@result[[sp]] else stop('No outliers')

  }else if(x@mode == FALSE && is.null(sp)){

    if(length(absoutliers)>=1) species <- x@result else stop('No outliers')

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }
  checkNA <- sapply(species, nrow)

  #remove methods that didnot execute
  speciesNULL <- species[!sapply(checkNA,is.null)]

  len <- sapply(speciesNULL, nrow) #check if any method returned no outliers

  if(any(len==0)) y <- speciesNULL[len !=0] else y <- speciesNULL #replace the list with empty data with y

  matover <- matrix(NA, nrow = length(y), ncol = length(y))

  for (cxiv in seq_along(y)) {

    m3 <- y[[cxiv]][[var]]

    for (cxv in seq_along(y)) {

      m4 <- y[[cxv]][[var]]

      intx1 <- intersect(m3, m4)

      len1 <- length(unique(m3))

      len2 <- length(unique(m4))

      matover[cxiv, cxv] <- (length(intx1)/min(len1, len2))*100
    }
  }
  jdo <- c()

  for (bii in 1:nrow(matover)) {

    ovalues <- matover[,bii]

    jdo[bii] <- mean(ovalues)
  }
  indxo <- which(jdo==max(jdo))

  if(length(indxo)>1) indxo <- indxo[1]

  bestmethodo <- names(y)[indxo]

  return(bestmethodo)
}


#' @title Cosine similarity index based on (Gautam & Kulkarni 2014; Joy & Renumol 2020)
#
#' @inheritParams ocindex
#'
#' @return best method for identifying outliers.
#'
#' @export
#'
#' @examples
#'
#' \donttest{
#'
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                     lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#' outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                          exclude = c('x','y'), multiple = TRUE,
#'                          methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' consineout <- cosine(x = outliersdf, sp= 1, threshold = 0.2)#
#' }
#'
cosine <- function(x, sp = NULL,threshold = NULL, warn=FALSE, autothreshold = FALSE){

  #check if there are absolute outliers
  if(isTRUE(autothreshold)){
    absoutliers_list <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn,
                                autothreshold = autothreshold)
    absoutliers <- absoutliers_list[[2]]
  }else{
    absoutliers <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn)
  }

  var <- x@varused
  if(length(var)>1) var <- sp else var


  if(x@mode == TRUE && !is.null(sp)){

    if(length(absoutliers)>=1)  species <- x@result[[sp]] else stop('No outliers')

  }else if(x@mode == FALSE && is.null(sp)){

    if(length(absoutliers)>=1) species <- x@result else stop('No outliers')

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }
  checkNA <- sapply(species, nrow)

  #remove methods that didnot execute
  speciesNULL <- species[!sapply(checkNA,is.null)]

  len <- sapply(speciesNULL, nrow) #check if any method returned no outliers

  if(any(len==0)) y <- speciesNULL[len !=0] else y <- speciesNULL #replace the list with empty data with y

  matcos <- matrix(NA, nrow = length(y), ncol = length(y))

  for (cxl in seq_along(y)) {

    m10 <- y[[cxl]][[var]]

    for (cxli in seq_along(y)) {

      m11 <- y[[cxli]][[var]]

      intx12 <- intersect(m10, m11)

      len11 <- sqrt(length(unique(m10)))

      len12 <- sqrt(length(unique(m11)))

      matcos[cxl, cxli] <- (length(intx12)/(len11*len12))*100
    }
  }
  jdcos <- c()

  for (bix in 1:nrow(matcos)) {

    cosv <- matcos[,bix]

    jdcos[bix] <- mean(cosv)
  }
  indxcos <- which(jdcos==max(jdcos))

  if(length(indxcos)>1) indxcos <- indxcos[1]

  bestmethodcos <- names(y)[indxcos]

  return(bestmethodcos)
}

#' @title Identifies best outlier detection method suing Sorensen Similarity Index.
#'
#' @inheritParams ocindex
#'
#' @return best method for identifying outliers.
#'
#' @export
#'
#' @examples
#'
#' \donttest{
#'
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                       lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#'
#' outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                          exclude = c('x','y'), multiple = TRUE,
#'                          methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' sordata <- sorensen(x = outliersdf, sp= 1, threshold = 0.2)#
#' }
#'
#'
sorensen <- function(x, sp = NULL,  threshold=NULL, warn=FALSE, autothreshold = FALSE ){


  if(isTRUE(autothreshold)){
    absoutliers_list <- ocindex(x= x, sp = sp,  absolute = TRUE, threshold = threshold, warn = warn,
                                autothreshold = autothreshold)
    absoutliers <- absoutliers_list[[2]]
  }else{
    absoutliers <- ocindex(x= x, sp = sp,  absolute = TRUE, threshold = threshold, warn = warn)
  }

  var <- x@varused
  if(length(var)>1) var <- sp else var

  if(x@mode == TRUE && !is.null(sp)){

    if(length(absoutliers)>=1)  species <- x@result[[sp]] else stop('No outliers')

  }else if(x@mode == FALSE && is.null(sp)){

    if(length(absoutliers)>=1) species <- x@result else stop('No outliers')

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }

  checkNA <- sapply(species, nrow)

  #remove methods that didnot execute
  speciesNULL <- species[!sapply(checkNA,is.null)]

  len <- sapply(speciesNULL, nrow) #check if any method returned no outliers

  if(any(len==0)) y <- speciesNULL[len !=0] else y <- speciesNULL #replace the list with empty data with y

  matsor <- matrix(NA, nrow = length(y), ncol = length(y))

  for (cxvi in seq_along(y)) {

    m5 <- y[[cxvi]][[var]]

    for (cxvii in seq_along(y)) {

      m6 <- y[[cxvii]][[var]]

      intx2 <- intersect(m5, m6)

      len5 <- length(unique(m5))

      len6 <- length(unique(m6))

      matsor[cxvi, cxvii] <- ((2*length(intx2))/sum(len5, len6))*100
    }
  }
  jdsor <- c()

  for (biii in 1:nrow(matsor)) {

    sorval <- matsor[,biii]

    jdsor[biii] <- mean(sorval)
  }
  indxs <- which(jdsor==max(jdsor))

  if(length(indxs)>1) indxs <- indxs[1]

  bestmethodsor <- names(y)[indxs]

  return(bestmethodsor)
}


#Simple matching coefficient

#' @title Identify best outlier detection method using simple matching coefficient.
#'
#' @inheritParams ocindex
#'
#' @return best method for identifying outliers based on simple matching coefficient.
#' @export
#'
#' @examples
#'
#' \donttest{
#'
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                       lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'#outlier detection
#'
#' outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                          exclude = c('x','y'), multiple = TRUE,
#'                          methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' smcout <- smc(x = outliersdf, sp= 1, threshold = 0.2)#
#' }
#'
smc <- function(x, sp=NULL,  threshold = NULL, warn = FALSE, autothreshold = FALSE){

  #check if there are absolute outliers
  if(isTRUE(autothreshold)){

    absoutliers_list <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn,
                                autothreshold = autothreshold)
    absoutliers <- absoutliers_list[[2]]
  }else{
    absoutliers <- ocindex(x= x, sp = sp,  absolute = TRUE, threshold = threshold, warn = warn)
  }

  var <- x@varused
  if(length(var)>1) var <- sp else var

  if(x@mode == TRUE && !is.null(sp)){

    if(length(absoutliers)>=1)  species <- x@result[[sp]] else stop('No outliers')

  }else if(x@mode == FALSE && is.null(sp)){

    if(length(absoutliers)>=1) species <- x@result else stop('No outliers')

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }

  checkNA <- sapply(species, nrow)

  #remove methods that didnot execute
  speciesNULL <- species[!sapply(checkNA,is.null)]

  len <- sapply(speciesNULL, nrow) #check if any method returned no outliers

  if(any(len==0)) y <- speciesNULL[len !=0] else y <- speciesNULL #replace the list with empty data with y

  lstvec_sm <- list()

  for (cxx in names(y)) {

    lstvec_sm[[cxx]] <- sort(y[[cxx]][[var]])
  }

  #Compute the values that are absent in methods in a pairwise format

  #step 1. create a vector of all outliers from all methods
  all_outliers_sm <- unique(do.call(c, lstvec_sm))

  #step 2. Determine where each outlier is present or absent

  matsim <-matrix(NA, nrow = length(all_outliers_sm), ncol = length(lstvec_sm))

  for (cxxi in seq_along(lstvec_sm)) {

    lsts_sim <- lstvec_sm[[cxxi]]

    for (cxxii in seq_along(all_outliers_sm)) {

      simpm <- all_outliers_sm[cxxii]

      matsim[cxxii, cxxi] <- ifelse(simpm%in%lsts_sim, 1, 0)
    }
  }

  #step 3. compute the second matrix but identifying where each method has absences 0 and convert it to 1 for easy counting
  #use matrix columns not rows in step 2
  matrix00 <- matrix(NA, nrow = ncol(matsim), ncol = ncol(matsim))#initiate matrix
  matrix11 <- matrix(NA, nrow = ncol(matsim), ncol = ncol(matsim))#initiate matrix
  matrix01 <- matrix(NA, nrow = ncol(matsim), ncol = ncol(matsim))#initiate matrix
  matrix10 <- matrix(NA, nrow = ncol(matsim), ncol = ncol(matsim))#initiate matrix

  m00values <- c() #absences
  m11values <- c() #intersections
  m01values <- c() #only in A
  m10values <- c() #Only in B

  for (cxxiii in 1:ncol(matsim)) {

    rowss <- matsim[,cxxiii] #col1

    for (cxxiv in 1:ncol(matsim)) {

      colss <- matsim[,cxxiv] #vs col2

      for (cxxv in seq_along(rowss)) {

        outv <- rowss[cxxv] # each value in the outer column

        inv <- colss[cxxv] # each value in the outer column

        m00values[cxxv] <- ifelse(outv==0 && inv==0, 1, 0) #convert the absences to 1 for easy summation

        m11values[cxxv] <- ifelse(outv==1 && inv==1, 1, 0)

        m01values[cxxv] <- ifelse(outv==0 && inv==1, 1, 0)

        m10values[cxxv] <- ifelse(outv==1 && inv==0, 1, 0)
      }

      matrix00[cxxiii, cxxiv] <- sum(m00values)
      matrix11[cxxiii, cxxiv] <- sum(m11values)
      matrix01[cxxiii, cxxiv] <- sum(m01values)
      matrix10[cxxiii, cxxiv] <- sum(m10values)
    }
  }

  #step 5. compute the simple matching

  simfinal <- c()
  smc_v <- c()

  for (cxxiv in 1:ncol(matrix00)) {

    absencecol <- matrix00[,cxxiv]

    intscol <- matrix11[, cxxiv]

    metcolin <- matrix01[, cxxiv]

    metcolout <- matrix10[, cxxiv]


    for (cxxv in seq_along(absencecol)) {

      m00 <- absencecol[cxxv]
      m11 <- intscol[cxxv]
      m01 <- metcolin[cxxv]
      m10 <- metcolout[cxxv]

      smc_v[cxxv] <- (m00+m11)/(m00+m11+m01+m10)

    }

    simfinal[cxxiv] <- mean(smc_v)

  }

  indxsim <- which(simfinal==max(simfinal))

  if(length(indxsim)>1) indxsim <- indxsim[1]

  bestmethodsim <- names(y)[indxsim]

  return(bestmethodsim)
}


#' @title Identify best outlier detection method using Hamming distance.
#'
#' @inheritParams ocindex
#'
#' @return best method based on hamming distance
#' @export
#'
#' @examples
#'\donttest{
#' data(efidata)
#'
#' danube <- system.file('extdata/danube.shp.zip', package='specleanr')
#'
#' db <- sf::st_read(danube, quiet=TRUE)
#'
#' wcd <- terra::rast(system.file('extdata/worldclim.tiff', package = "specleanr"))
#'
#' extdf <- pred_extract(data = efidata, raster = wcd,
#'                     lat = 'decimalLatitude', lon = 'decimalLongitude',
#'                      colsp = "scientificName",
#'                      list = TRUE,verbose = FALSE,
#'                      minpts = 6,merge = FALSE)#basin removed
#'
#'  #outlier detection
#'
#' outliersdf <- multidetect(data = extdf, output='outlier', var = 'bio6',
#'                          exclude = c('x','y'), multiple = TRUE,
#'                          methods = c('mixediqr', "iqr", "mahal", "iqr", "logboxplot"))
#'
#' hamout <- hamming(x = outliersdf, sp= 1, threshold = 0.2)#
#' }
#'
#'
hamming <- function(x, sp=NULL, threshold = NULL, warn = FALSE, autothreshold = FALSE){

  #check if there are absolute outliers
  if(isTRUE(autothreshold)){
    absoutliers_list <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn,
                                autothreshold = autothreshold)
    absoutliers <- absoutliers_list[[2]]
  }else{
    absoutliers <- ocindex(x= x, sp = sp, absolute = TRUE, threshold = threshold, warn = warn)
  }

  var <- x@varused
  if(length(var)>1) var <- sp else var

  if(x@mode == TRUE && !is.null(sp)){

    if(length(absoutliers)>=1)  species <- x@result[[sp]]

  }else if(x@mode == FALSE && is.null(sp)){

    if(length(absoutliers)>=1) species <- x@result

  }else{
    stop('If they are  multiple species, indicate a particular species name or index')
  }

  checkNA <- sapply(species, nrow)

  #remove methods that didn't execute
  speciesNULL <- species[!sapply(checkNA,is.null)]

  len <- sapply(speciesNULL, nrow) #check if any method returned no outliers

  if(any(len==0)) y <- speciesNULL[len !=0] else y <- speciesNULL #replace the list with empty data with y

  lstvec_ham <- list()

  for (l in names(y)) {

    lstvec_ham[[l]] <- sort(y[[l]][[var]])
  }

  #Compute the values that are absent in methods in a pairwise format

  #step 1. create a vector of all outliers from all methods
  all_outliers_ham <- unique(do.call(c, lstvec_ham))

  #step 2. Determine where each outlier is present or absent

  matham <-matrix(NA, nrow = length(all_outliers_ham), ncol = length(lstvec_ham))

  for (li in seq_along(lstvec_ham)) {

    lsts_ham <- lstvec_ham[[li]]
    for (lii in seq_along(all_outliers_ham)) {

      hampm <- all_outliers_ham[lii]

      matham[lii, li] <- ifelse(hampm%in%lsts_ham, 1, 0)
    }
  }

  matcompare <- matrix(NA, nrow = ncol(matham), ncol = ncol(matham))

  for (liii in 1:ncol(matham)) {

    m1 <- matham[,liii]

    for (liv in 1:ncol(matham)) {

      m2 <- matham[,liv]

      matcompare[liii, liv] <- Reduce(x=abs(m1-m2), f='+')/length(m1)

    }

  }

  #best method has the lowest ham difference
  hamfinal <- apply(matcompare, 2, mean)

  indxham <- which(hamfinal==min(hamfinal))

  if(length(indxham)>1) indxham <- indxham[1]

  bestmethodham <- names(y)[indxham]

  return(bestmethodham)
}
Any scripts or data that you put into this service are public.
specleanr documentation built on Nov. 26, 2025, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
specleanr
Detecting Environmental Outliers in Data Analysis Pipelines

R/similaritytests.R
In specleanr: Detecting Environmental Outliers in Data Analysis Pipelines

Defines functions hamming smc sorensen cosine overlap jaccard multiabsolute ocindex oci

Documented in cosine hamming jaccard multiabsolute ocindex overlap smc sorensen

Try the specleanr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

specleanr Detecting Environmental Outliers in Data Analysis Pipelines

R/similaritytests.R In specleanr: Detecting Environmental Outliers in Data Analysis Pipelines

Defines functions hamming smc sorensen cosine overlap jaccard multiabsolute ocindex oci

Documented in cosine hamming jaccard multiabsolute ocindex overlap smc sorensen

Try the specleanr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

specleanr
Detecting Environmental Outliers in Data Analysis Pipelines

R/similaritytests.R
In specleanr: Detecting Environmental Outliers in Data Analysis Pipelines