Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup, include = FALSE---------------------------------------------------
library(vacuum)
library(ggplot2)
library(magrittr)
library(tidyr)
library(dplyr)
## ----echo = FALSE, fig.width = 7, fig.height = 4------------------------------
# Normal Q-Q plot for Tukey's sample data
data.frame(y = table_1) %>%
ggplot(aes(sample = y)) +
stat_qq() +
stat_qq_line() +
#geom_abline(aes(intercept = 0, slope = 1)) +
labs(title = 'Normal Q-Q plot of sample data')
## ----echo = FALSE, fig.width = 7, fig.height = 4------------------------------
x <- table_1
n <- length(x)
y_split <- median(x)
dat <- data.frame(i = 1:n,
y_sorted = sort(x),
a = a_qnorm(1:n, n),
z = (sort(x) - y_split) / a_qnorm(1:n, n)
)
# find the point with the greatest slope
key_pt <- dat[which.max(dat$z), ]
# plot the points and the slope of the point key point
ggplot(dat, aes(x = a, y = y_sorted)) +
geom_point(color = 'darkgrey') +
geom_point(dat = key_pt, aes(x = a, y = y_sorted), color = 'black') +
geom_point(aes(x = 0, y = y_split), color = 'red') +
# example line
geom_segment(aes(x = 0, y = y_split, xend = key_pt$a, yend = key_pt$y_sorted)) +
labs(title = 'Sorted values of y plotted against theoretical distribution a(y)', y = 'y')
## ----echo = FALSE, , fig.width = 7, fig.height = 4----------------------------
A <- 0
B <- 1.5
tmp <- funop(table_1, A, B)
# need z & special for every point
tmp$z <- (tmp$y - median(tmp$y)) / a_qnorm(tmp$i, n)
tmp$special[is.na(tmp$special)] <- FALSE
# need outer to get special and outer to add up as a factor
tmp$outer <- !tmp$middle
z_split <- attr(tmp, 'z_split')
y_split <- attr(tmp, 'y_split')
extreme_B <- B * z_split
extreme_A <- A * z_split
ggplot(tmp, aes(x = y, y = z)) +
geom_point(aes(color = as.factor(outer + special))) +
geom_hline(yintercept = z_split) +
geom_hline(yintercept = extreme_B,
color = 'blue',
alpha = 0.5) +
geom_vline(xintercept = y_split,
color = 'green3',
alpha = 0.75) +
geom_rect(
xmin = -Inf,
xmax = Inf,
ymin = extreme_B,
ymax = Inf,
alpha = 0.0075,
fill = 'blue'
) +
geom_rect(
xmin = y_split - extreme_A,
xmax = y_split + extreme_A,
ymin = -Inf,
ymax = Inf,
alpha = 0.01,
fill = 'green'
) +
labs(title = 'FUNOP: slope (z) plotted against original values') +
labs(x = 'original values') +
scale_color_manual(
name = 'values',
labels = c(
'excluded',
'undistinguished',
'special attention'
),
values = c('green3', 'black', 'red')
)
## -----------------------------------------------------------------------------
# Table 1 contains example data from Tukey's paper
table_1
result <- funop(table_1)
result
# value of the outliers
result[which(result$special == TRUE), 'y']
## ----echo = FALSE, fig.width = 7, fig.height = 4------------------------------
# cleanse the table
cleansed_t2 <- funor_funom(table_2)
data.frame(
i = 1:length(table_2),
original = as.vector(table_2),
cleansed = as.vector(cleansed_t2)
) %>%
ggplot(aes(x = i)) +
geom_point(aes(y = original), color = 'black') +
geom_segment(aes(
x = i,
y = original,
xend = i,
yend = cleansed
),
color = 'black',
linetype = 'dashed') +
geom_point(aes(y = cleansed), color = 'grey') +
labs(title = 'Changes to Table 2', y = 'value', x = '')
## -----------------------------------------------------------------------------
# create a random dataset, representing a two-way table
set.seed(42)
dat <- matrix(rnorm(16), nrow = 4)
# add some noise
dat[2, 1] <- rnorm(1, mean = 10)
# cleanse the table of outliers
cleansed_dat <- funor_funom(dat)
# look at the two tables, before and after
dat
cleansed_dat
# look at the difference between the two tables
# only one non-zero difference
dat - cleansed_dat
## -----------------------------------------------------------------------------
# convert the output of vacuum_cleaner into a long data frame
dat <- vacuum_cleaner(table_8) %>%
as.data.frame() %>%
dplyr::mutate('row' = dplyr::row_number()) %>%
tidyr::pivot_longer(cols = dplyr::starts_with('V'),
names_to = 'col',
values_to = 'value')
# since the table_8 was a two-way table, the rows are factors
dat$row <- as.factor(dat$row)
# conduct analysis of variance
aov_results <- aov(value ~ row + col, dat)
summary(aov_results)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.