General outlier detection for univariate datasets

# Default knitr chunk options for the code chunks of this vignette
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(specleanr)

Introduction to general data outlier detection

Setting one variable of interest

1. Preparing data

# Work on a copy of the built-in iris data
irisdata1 <- iris

# Four synthetic rows carrying extreme values and NAs; the columns are named
# after the iris columns so the rows can be appended directly
rowsOutNA1 <- setNames(
  data.frame(
    c(344, NA, NA, NA),
    c(34, 45, 544, NA),
    c(584, 5, 554, NA),
    c(575, 4554, 474, NA),
    rep("setosa", 4)
  ),
  colnames(irisdata1)
)

# Append the synthetic rows to the original records
dfinal <- rbind(irisdata1, rowsOutNA1)

Detecting outliers in the modified iris dataset

We can either use univariate methods to detect outliers in a single variable such as Sepal.Length, or exclude the species column and use multivariate methods such as isolation forest, the Mahalanobis outlier detection method, or one-class support vector machines. To identify the methods allowed in this package, run extractMethods().

NOTE: Because we are considering univariate analysis, the parameter sdm is set to FALSE.

2. Filter out only setosa data before outlier detection

# Restrict the data to setosa records, keeping the variable of interest
# and the species column
setosadf <- dfinal[dfinal$Species %in% "setosa", c("Sepal.Width", "Species")]

# Apply the listed univariate methods to Sepal.Width; sdm = FALSE because
# this is a plain univariate analysis
setosa_outlier_detection <- multidetect(
  data = setosadf,
  var = "Sepal.Width",
  multiple = FALSE,
  methods = c(
    "adjbox", "iqr", "hampel", "jknife", "seqfences", "mixediqr",
    "distboxplot", "semiqr", "zscore", "logboxplot", "medianrule"
  ),
  silence_true_errors = FALSE,
  missingness = 0.1,
  sdm = FALSE,
  na.inform = TRUE
)

# Uncomment to see the methods allowed in the package:
# extractMethods()

3. Visualize the number of outliers detected by each method

# Plot the number of outliers detected by each method for the setosa subset
ggoutliers(setosa_outlier_detection)

4. Obtaining quality controlled dataset using loess method or data labeling

# Quality-controlled setosa data obtained with the loess option
setosa_qc_loess <- extract_clean_data(
  refdata = setosadf,
  outliers = setosa_outlier_detection,
  loess = TRUE
)

# Number of rows in the cleaned data
nrow(setosa_qc_loess)

# Number of rows in the reference data, for comparison
nrow(setosadf)

# Labeling alternative: classify the reference records using the detection results
setosa_qc_labeled <- classify_data(
  refdata = setosadf,
  outliers = setosa_outlier_detection
)

5. Visualize labelled quality controlled dataset

# Plot the labelled setosa records in one dimension
ggenvironmentalspace(
  setosa_qc_labeled,
  type = "1D",
  xlab = "Outlier labels",
  ylab = "Number of records",
  ggxangle = 45,
  xhjust = 1,
  scalecolor = "viridis",
  legend_position = "blank"
)

For multiple species but using only one variable of interest

NOTE

6. Outlier detection across the species groups in iris dataset

# Same univariate methods on Sepal.Width, now run with multiple = TRUE and
# Species as the grouping column
multspp_outlier_detection <- multidetect(
  data = dfinal,
  var = "Sepal.Width",
  multiple = TRUE,
  var_col = "Species",
  methods = c(
    "adjbox", "iqr", "hampel", "jknife", "seqfences", "mixediqr",
    "distboxplot", "semiqr", "zscore", "logboxplot", "medianrule"
  ),
  silence_true_errors = FALSE,
  missingness = 0.1,
  sdm = FALSE,
  na.inform = TRUE
)

7. Visualise the number of outliers detected by each method

# Plot the number of outliers detected by each method for the multi-species run
ggoutliers(multspp_outlier_detection)

8. Obtaining quality controlled dataset using loess method or data labeling

# Quality-controlled data for all species obtained with the loess option
multsp_qc_loess <- extract_clean_data(
  refdata = dfinal,
  outliers = multspp_outlier_detection,
  var_col = "Species",
  loess = TRUE
)

# Number of rows in the cleaned data
nrow(multsp_qc_loess)

# Number of rows in the reference data, for comparison
nrow(dfinal)

# Labeling alternative: classify the reference records using the detection results
multi_qc_labeled <- classify_data(
  refdata = dfinal,
  outliers = multspp_outlier_detection,
  var_col = "Species"
)

10. Visualise labelled quality controlled dataset

# Plot the labelled multi-species records in one dimension
ggenvironmentalspace(
  multi_qc_labeled,
  type = "1D",
  xlab = "Outlier labels",
  ylab = "Number of records",
  ggxangle = 45,
  xhjust = 1,
  scalecolor = "viridis",
  legend_position = "blank"
)

The second approach is setting multiple variables of interest

11. Outlier detection

# Two variables of interest at once (Sepal.Length and Sepal.Width), run with
# multiple = TRUE and Species as the grouping column
multivariables <- multidetect(
  data = dfinal,
  var = c("Sepal.Length", "Sepal.Width"),
  multiple = TRUE,
  var_col = "Species",
  output = "outlier",
  methods = c(
    "zscore", "adjbox", "logboxplot", "distboxplot", "iqr",
    "semiqr", "seqfences", "hampel", "jknife"
  ),
  warn = FALSE,
  sdm = FALSE
)

12. Visualize the number of outliers detected by each method

# Plot the number of outliers detected by each method for the multi-variable run
ggoutliers(multivariables)

13. Data extraction

NOTE

# With outlier_to_NA = TRUE, outliers are set to NA for each variable
lenwidth_clean <- extract_clean_data(
  dfinal,
  outliers = multivariables,
  var_col = "Species",
  outlier_to_NA = TRUE,
  threshold = 0.8
)
nrow(lenwidth_clean)

# Same extraction with outlier_to_NA = FALSE; compare the row count with the one above
lenwidth_long <- extract_clean_data(
  dfinal,
  outliers = multivariables,
  var_col = "Species",
  outlier_to_NA = FALSE,
  threshold = 0.8
)
nrow(lenwidth_long)

The package is undergoing peer review for publication



Try the specleanr package in your browser

Any scripts or data that you put into this service are public.

specleanr documentation built on Nov. 26, 2025, 1:07 a.m.