fit_mixed_outlier: Mixed Outlier Test
In mlindsk/molic: Multivariate Outlier Detection in Contingency Tables

fit_mixed_outlier

R Documentation

Mixed Outlier Test

Description

A function for outlier detection with mixed, but independen, information

Usage

fit_mixed_outlier(m1, m2)

Arguments

`m1`	An object returned from `fit_outlier`
`m2`	An object returned from `fit_outlier`

Details

It is assumed that the input data to m1 and m2 holds information about the same observation in corresponding rows. Thus, the two datasets must also be of same dimension.

Value

An object of type mixed_outlier with novelty or outlier as child classes. These are used for different purposes. See fit_outlier.

Examples


library(dplyr)
library(ess)  # for fit_components
set.seed(7)   # for reproducibility

## Data

# The components - here microhaplotypes
haps <- tgp_haps[1:5] # only a subset of data is used to exemplify
dat <- tgp_dat %>%
       select(pop_meta, sample_name, all_of(unname(unlist(haps))))

# All the Europeans
eur <- dat %>%
  as_tibble() %>%
  filter(pop_meta == "EUR")

# Extracting the two databases for each copy of the chromosomes
eur_a <- eur %>%
  filter(grepl("a$", sample_name)) %>%
  select(-c(1:2))

eur_b <- eur %>%
  filter(grepl("b$", sample_name)) %>%
  select(-c(1:2))


# Fitting the interaction graphs on the EUR data
ga <- fit_components(eur_a, comp = haps, trace = FALSE)
gb <- fit_components(eur_b, comp = haps, trace = FALSE)

## ---------------------------------------------------------
##                       EXAMPLE 1
##   Testing which observations within data are outliers
## ---------------------------------------------------------

# Only 500 simulations is used here to exeplify
# The default number of simulations is 10,000
m1 <- fit_outlier(eur_a, ga, nsim = 500) # consider using more cores (ncores argument)
m2 <- fit_outlier(eur_b, gb, nsim = 500) # consider using more cores (ncores argument)
m  <- fit_mixed_outlier(m1, m2)
print(m)
plot(m)

outs <- outliers(m)
eur_a_outs <- eur_a[which(outs), ]
eur_b_outs <- eur_b[which(outs), ]

# Retrieving the test statistic for individual observations
x1 <- rbind(eur_a_outs[1, ], eur_b_outs[1, ])
x2 <- rbind(eur_a[1, ], eur_b[1, ])
dev1 <- deviance(m, x1) # falls within the critical region in the plot (the red area)
dev2 <- deviance(m, x2) # falls within the acceptable region in the plot

dev1
dev2

# Retrieving the pvalues
pval(m, dev1)
pval(m, dev2)



 
## ---------------------------------------------------------
##                       EXAMPLE 2
##      Testing if a new observation is an outlier
## ---------------------------------------------------------

# Testing if an American is an outlier in Europe
amr <- dat %>%
  as_tibble() %>%
  filter(pop_meta == "AMR")

z1  <- amr %>%
  filter(grepl("a$", sample_name)) %>% 
  select(unname(unlist(haps))) %>%
  slice(1) %>%
  unlist()

z2  <- amr %>%
  filter(grepl("b$", sample_name)) %>% 
  select(unname(unlist(haps))) %>%
  slice(1) %>%
  unlist()

# Only 500 simulations is used here to exemplify
# The default number of simulations is 10,000
m3 <- fit_outlier(eur_a, ga, z1, nsim = 500) # consider using more cores (ncores argument)
m4 <- fit_outlier(eur_b, gb, z2, nsim = 500) # consider using more cores (ncores argument)
m5 <- fit_mixed_outlier(m3, m4)
print(m5)
plot(m5)

mlindsk/molic documentation built on April 1, 2022, 5:21 p.m.