inst/doc/modi_vignette.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ---- echo=FALSE, results='asis'----------------------------------------------
library(modi)
knitr::kable(head(sepe[ ,c("idnr","weight","totinvwp","totinvwm","totinvap","totinvto","totexpwp","totexpwm","totexpap","totexpto")], 10))

## ---- eval = TRUE-------------------------------------------------------------
# attach data
data("sepe")

# recode 0s as NA
sepenozero <- sepe
sepenozero[sepenozero == 0] <- NA

## ---- eval = TRUE-------------------------------------------------------------
# relevant variables
sepevar8 <- c("totinvwp","totinvwm","totinvap","totinvto",
              "totexpwp","totexpwm","totexpap","totexpto")

# log(x + 1) transformation
sepe_transformed <- log(sepenozero[ ,sepevar8] + 1)

# show the first 5 observations
head(sepe_transformed)

## ---- eval = TRUE-------------------------------------------------------------
# run the BEM() algorithm
res.bem <- BEM(sepe_transformed, sepe$weight, c0 = 5)

## ---- eval = TRUE-------------------------------------------------------------
# run the BEM() algorithm with different alpha
res.bem <- BEM(sepe_transformed, sepe$weight, c0 = 5, alpha = 0.01 / nrow(sepe_transformed))

## ---- eval = TRUE, fig.width = 7, fig.height = 5, results = "hide"------------
# QQ-plot of MD vs. F-distribution
PlotMD(res.bem$dist, ncol(sepe_transformed), alpha = 0.95)

## ---- eval = TRUE-------------------------------------------------------------
# find the cutpoint chosen by BEM()
min(res.bem$dist[res.bem$outind])

## ---- eval = TRUE-------------------------------------------------------------
# find outliers with cutpoint 70
outind <- ifelse(res.bem$dist > 70, TRUE, FALSE)

# set NAs to FALSE
outind[is.na(outind)] <- FALSE

# how many outliers are there?
sum(outind)

## ---- eval = TRUE-------------------------------------------------------------
# find cutpoint with fixed number of outliers
quantile(res.bem$dist, 0.95, na.rm = TRUE)

## ---- eval = TRUE, fig.width = 7, fig.height = 5, results = "hide"------------
# create transformed data including zeros
df <- log(sepe[, sepevar8] + 1)

# set up scatterplot totexpto vs. totinvto
plot(df$totinvto, df$totexpto, type = "n", xlab = "Total Inv.", ylab = "Total Exp.")

# plot comparison of outliers determined with visual cutpoint and default cutpoint
points(df$totinvto[!res.bem$outind], df$totexpto[!res.bem$outind])
points(df$totinvto[res.bem$outind], df$totexpto[res.bem$outind], pch = 4, col = "red")
points(df$totinvto[outind], df$totexpto[outind], pch = 0, col = "blue")

## ---- eval = TRUE-------------------------------------------------------------
# apply Winsimp()
res.winsimp <- Winsimp(sepe_transformed, 
                       res.bem$center, 
                       res.bem$scatter, 
                       outind)

## ---- eval = TRUE-------------------------------------------------------------
# get the imputed data
imp_data <- res.winsimp$imputed.data

# indicate the zeros in original dataset
zeros <- ifelse(sepe[ ,sepevar8] == 0, 1, 0)

# redefine NAs as 0
zeros[is.na(zeros)] <- 0

# re-insert zeros
imp_data <- as.data.frame(ifelse(zeros == 1, 0, imp_data))

## ---- eval = TRUE, fig.width = 7, fig.height = 5, results = "hide"------------
# create transformed data including zeros
df <- log(sepe[, sepevar8] + 1)

# set up scatterplot totexpto vs. totinvto
plot(df$totinvto, df$totexpto, type = "n", xlab = "Total Inv.", ylab = "Total Exp.")

# plot comparison of outliers determined with visual cutpoint and default cutpoint
points(df$totinvto[outind], df$totexpto[outind], pch = 0, col = "blue")
points(imp_data$totinvto[outind], imp_data$totexpto[outind], pch = 0, col = "green")

## ---- eval = TRUE-------------------------------------------------------------
# log(x + 1) transformation
sepe_transformed <- log(sepenozero[ ,sepevar8] + 1)

## ---- eval = TRUE-------------------------------------------------------------
# run the TRC() algorithm
res.trc <- TRC(sepe_transformed, sepe$weight)

## ---- eval = TRUE-------------------------------------------------------------
# run the TRC() algorithm
res.trc <- TRC(sepe_transformed, sepe$weight, gamma = 30 / res.trc$sample.size)

## ---- eval = TRUE-------------------------------------------------------------
# find the cutpoint chosen by TRC()
min(res.trc$dist[res.trc$outind])

## ---- eval = TRUE, fig.width = 7, fig.height = 5, results = "hide"------------
# QQ-plot of MD vs. F-distribution
PlotMD(res.trc$dist, ncol(sepe_transformed))

## ---- eval = TRUE-------------------------------------------------------------
# find outliers with cutpoint 70
outind <- ifelse(res.trc$dist > 210, TRUE, FALSE)

# set NAs to FALSE
outind[is.na(outind)] <- FALSE

# how many outliers are there?
sum(outind)

## ---- eval = TRUE-------------------------------------------------------------
# run the GIMCD() algorithm
res.gimcd <- GIMCD(sepe_transformed, seedem = 234567819, seedmcd = 4097)

## ---- eval = TRUE-------------------------------------------------------------
# find the cutpoint chosen by GIMCD()
min(res.gimcd$dist[res.gimcd$outind])

## ---- eval = TRUE, fig.width = 7, fig.height = 5, results = "hide"------------
# QQ-plot of MD vs. F-distribution
PlotMD(res.gimcd$dist, ncol(sepe_transformed))

## ---- eval = TRUE-------------------------------------------------------------
# find outliers with cutpoint 70
outind <- ifelse(res.gimcd$dist > 24, TRUE, FALSE)

# set NAs to FALSE
outind[is.na(outind)] <- FALSE

# how many outliers are there?
sum(outind)

## ---- eval = TRUE, fig.width = 7, fig.height = 5------------------------------
# create transformed data including zeros
df <- log(sepe[, sepevar8] + 1)

# run the EAdet() algorithm
res.eadet <- EAdet(df, sepe$weight)

## ---- eval = TRUE-------------------------------------------------------------
# how many outliers?
sum(res.eadet$outind, na.rm = TRUE)

## ---- eval = TRUE-------------------------------------------------------------
# determine outliers based on new cutpoint
outind <- res.eadet$infection.time >= 5

# how many outliers are there?
sum(outind, na.rm = TRUE)

## ---- eval = TRUE-------------------------------------------------------------
# determine outliers based on new cutpoint
res.eaimp <- EAimp(df, sepe$weight, outind = res.eadet$outind, 
                   duration = res.eadet$duration)

Try the modi package in your browser

Any scripts or data that you put into this service are public.

modi documentation built on March 31, 2023, 8:35 p.m.