# inst/doc/generaloutlier.R

## ----include = FALSE----------------------------------------------------------
# Chunk defaults for the whole vignette: collapse = TRUE and "#>" as the
# comment prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
library(specleanr)

## ----Getting iris and adding some outliers------------------------------------

# Work on a copy of the built-in iris data.
irisdata1 <- iris

# Four synthetic setosa records carrying extreme values and NAs. The columns
# are named after the iris columns so the rows can be appended directly.
rowsOutNA1 <- data.frame(
  Sepal.Length = c(344, NA, NA, NA),
  Sepal.Width = c(34, 45, 544, NA),
  Petal.Length = c(584, 5, 554, NA),
  Petal.Width = c(575, 4554, 474, NA),
  Species = rep("setosa", 4L)
)

# Original records followed by the synthetic ones.
dfinal <- rbind(irisdata1, rowsOutNA1)


## ----outlier detection using iris dataset-------------------------------------

# Restrict the data to setosa records, keeping only the variable of interest
# and the species label.
setosadf <- dfinal[
  dfinal$Species %in% "setosa",
  c("Sepal.Width", "Species")
]

# Single-group run (multiple = FALSE) on Sepal.Width with eleven methods.
setosa_outlier_detection <- multidetect(
  data = setosadf,
  var = "Sepal.Width",
  multiple = FALSE,
  methods = c(
    "adjbox", "iqr", "hampel", "jknife",
    "seqfences", "mixediqr",
    "distboxplot", "semiqr",
    "zscore", "logboxplot", "medianrule"
  ),
  silence_true_errors = FALSE,
  missingness = 0.1,
  sdm = FALSE,
  na.inform = TRUE
)

#extractMethods()


## ----ploting2, fig.width = 6, fig.height= 4, fig.align='center'---------------

# Plot the single-group detection result.
ggoutliers(setosa_outlier_detection)



## ----clean data extraction and labelling--------------------------------------

# Extract the cleaned setosa records from the detection result (loess = TRUE).
setosa_qc_loess <- extract_clean_data(
  refdata = setosadf,
  outliers = setosa_outlier_detection,
  loess = TRUE
)

# Number of rows in the cleaned data
nrow(setosa_qc_loess)

# Number of rows in the reference data, for comparison
nrow(setosadf)

# Label the reference records using the same detection result.
setosa_qc_labeled <- classify_data(
  refdata = setosadf,
  outliers = setosa_outlier_detection
)


## ----clean data extraction and labelling_multiple, fig.height=4.1, fig.width=4.5, fig.align='center'----

# Plot the labelled setosa data (type = "1D") with custom axis titles.
ggenvironmentalspace(
  setosa_qc_labeled,
  type = "1D",
  xlab = "Outlier labels",
  ylab = "Number of records",
  ggxangle = 45,
  xhjust = 1,
  scalecolor = "viridis",
  legend_position = "blank"
)


## ----outlier detection using multiple-----------------------------------------

# Multi-group run (multiple = TRUE) on Sepal.Width over the full data set,
# with Species passed as var_col.
multspp_outlier_detection <- multidetect(
  data = dfinal,
  var = "Sepal.Width",
  multiple = TRUE,
  var_col = "Species",
  methods = c(
    "adjbox", "iqr", "hampel", "jknife",
    "seqfences", "mixediqr",
    "distboxplot", "semiqr",
    "zscore", "logboxplot", "medianrule"
  ),
  silence_true_errors = FALSE,
  missingness = 0.1,
  sdm = FALSE,
  na.inform = TRUE
)


## ----ploting_single, fig.width = 6, fig.height= 4, fig.align='center'---------

# Plot the multi-group detection result.
ggoutliers(multspp_outlier_detection)



## ----clean data extraction and labelling mult---------------------------------

# Extract the cleaned records from the multi-group result (loess = TRUE).
multsp_qc_loess <- extract_clean_data(
  refdata = dfinal,
  outliers = multspp_outlier_detection,
  var_col = "Species",
  loess = TRUE
)

# Number of rows in the cleaned data
nrow(multsp_qc_loess)

# Number of rows in the reference data, for comparison
nrow(dfinal)

# Label the reference records using the same detection result.
multi_qc_labeled <- classify_data(
  refdata = dfinal,
  outliers = multspp_outlier_detection,
  var_col = "Species"
)


## ----visualise data labelling, fig.height=4.5, fig.width=6.5, fig.align='center'----

# Plot the labelled multi-group data (type = "1D") with custom axis titles.
ggenvironmentalspace(
  multi_qc_labeled,
  type = "1D",
  xlab = "Outlier labels",
  ylab = "Number of records",
  ggxangle = 45,
  xhjust = 1,
  scalecolor = "viridis",
  legend_position = "blank"
)


## ----multiple variables of interest-------------------------------------------

# Multi-group run on two variables at once (Sepal.Length and Sepal.Width),
# with Species passed as var_col.
multivariables <- multidetect(
  data = dfinal,
  multiple = TRUE,
  var = c("Sepal.Length", "Sepal.Width"),
  output = "outlier",
  var_col = "Species",
  methods = c(
    "zscore", "adjbox",
    "logboxplot", "distboxplot",
    "iqr", "semiqr", "seqfences", "hampel",
    "jknife"
  ),
  warn = FALSE,
  sdm = FALSE
)


## ----ploting, fig.width = 6, fig.height= 4, fig.align='center'----------------

# Plot the multi-variable detection result.
ggoutliers(multivariables)



## ----data extraction----------------------------------------------------------
# With outlier_to_NA = TRUE, outliers are set to NA for each variable.

lenwidth_clean <- extract_clean_data(
  dfinal,
  outliers = multivariables,
  var_col = "Species",
  outlier_to_NA = TRUE,
  threshold = 0.8
)
nrow(lenwidth_clean)

# Same extraction, but with outlier_to_NA = FALSE.
lenwidth_long <- extract_clean_data(
  dfinal,
  outliers = multivariables,
  var_col = "Species",
  outlier_to_NA = FALSE,
  threshold = 0.8
)
nrow(lenwidth_long)

# Try the specleanr package in your browser

# Any scripts or data that you put into this service are public.

# specleanr documentation built on Nov. 26, 2025, 1:07 a.m.