knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "HRR_simulation_files/figures/HRR-",
  out.width = "100%",
  fig.align = "center"
)

Simulation studies for hazard rate adjuestment

This simulation study contains 3 cases:

  1. Scan on uniform distribution random varialbes with an embedded cluster.
  2. Scan on first order fourier series random varialbes with an embedded cluster, no hazard rate adjuestment.
  3. Scan on first order fourier series random varialbes with an embedded cluster, hazard rate adjuestment.

Pre settings

#---------------------------------- packages ----------------------------------#
library(POT)
library(AnomDetct)
library(ggplot2)
library(parallel)
#----------------------------- predefine functions ----------------------------#
# This check if the estimate probability is valid
check_fun <- function(N, critical_cnt, window_l, p){
  p1 <- 1 - AnomDetct::q1_function(critical_cnt, window_l, p)

  n <- floor(N/window_l-1)
  return(p1 <= 0.025 &
           n>3 &
           3.3*n*p1^2 < 1)
}

# This check if the detected cluster is ture cluster
check_clst <- function(clst_row, x_clst, x_all){
  clst_intv  <- range(x_clst)
  detct_intv <- c(x_all[clst_row[1]],x_all[clst_row[2]])
  return(max(detct_intv[1],clst_intv[1]) < min(detct_intv[2],clst_intv[2]))
}
#----------------------------- parameters setting -----------------------------#
## sample size ##
n <- 10000L

## cluster size ##
n_clst <- 150L

## cos function ##
f_x <- function(x) a*sin(2*pi*x) + 1
F_x <- function(x) x + a/(2*pi)*(1-cos(2*pi*x))

## cos parameter ##
a <- -0.7

## generalized pareto distribution parameter ##
mu <- 0
xi <- 2
sig <- 10

## cluster dist ##
clst_loc <- 0.5
clst_bin <- 0.005

## theta ##
theta_th <- 0.1

## p-value ##
alpha_0 <- 0.05

## simulation repeating time ##
sim_rep <- 5000L

As the size are same cross different cases, critical values only need to be computed once.

#------------------------------ critical values -------------------------------#
min_crit <- AnomDetct::crit_theta_fun(N = n, window_l = n_clst, 
                                      p = 1-exp(-theta_th), alpha = alpha_0)

## check if prob_fun used in crit_theta_fun is reliable ##
if(!check_fun(N = n, critical_cnt = min_crit, 
              window_l = n_clst, p = 1-exp(-theta_th))) stop("Invalid input")


## alpha_th may not be same as alpha_0 as min_crit is discrete ##
alpha_th <- AnomDetct::prob_fun(N = n, k = min_crit, 
                                m = n_clst, p = 1-exp(-theta_th))

Case1

Cluster

case1_res <- readRDS("HRR_simulation_files/RDS/case1_sm_c_res.rds")

case1_fnc_clst generates data in uniform distribution and mix it with cluster data. The function returns a matrix of 3 columns: start,end,if detected cluster intersects true cluster

case1_fnc_clst <- function(i){
  ## generate data ##
  x_base <- runif(n-n_clst)
  x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

  ## cdf x ##
  cdf_x <- sort(c(x_base,x_clst))

  ## No hazard rate ratio adjustment ##
  HRR_fun  <- stats::approxfun(c(0,1), c(1,1))
  HRR      <- HRR_fun(cdf_x)

  ## detect cluster ##
  clst <- AnomDetct::hypo_test(cdf = cdf_x, HRR = HRR, critical_cnt = min_crit,
                               window_l = n_clst, theta = theta_th)

  if(nrow(clst)==0) return(matrix(nrow = 0, ncol = 3,
                                  dimnames = list(NULL,
                                                  c("start","end","clst_is_ture"))))
  clst_is_ture <- mapply(check_clst, 
                         split(clst,seq.int(nrow(clst))),
                         MoreArgs = list(x_clst = x_clst,
                                         x_all  = cdf_x))
  ## change clst from index to real range ##
  clst <- t(apply(clst,1,function(i)cdf_x[i]))

  colnames(clst) <- c("start","end")
  return(cbind(clst,clst_is_ture))
}
## seed ##
RNGkind("L'Ecuyer-CMRG")
set.seed(201911121)

## scan results ##
case1_res <- mclapply(seq.int(sim_rep), case1_fnc_clst)
## true positive ##
TP_clst <- sapply(case1_res, function(x) sum(x[,3])>0)
TP      <- sum(TP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case1_mtx <- do.call(rbind, case1_res)
cnt_fnc <- function(x) sum(case1_mtx[,1] <= x & case1_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.01)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot density function ##
den_fnc <- function(x){
  base_den <- (1-n_clst/n)
  clst_den <- n_clst/n*(clst_loc-clst_bin<=x && x <=clst_loc+clst_bin)/
    (2*clst_bin)
  return(base_den + clst_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
x_base <- POT::rgpd(n-n_clst, loc = mu, scale = sig, shape = xi)
x_clst <- qgpd(runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin),
               loc = mu, scale = sig, shape = xi)
x_all <- sort(c(x_clst,x_base))

p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2)

print(paste("True positive is",TP, sep = " "))
p_clst
p_den
p_hist

No cluster

case1_res <- readRDS("HRR_simulation_files/RDS/case1_sm_nc_res.rds")
case1_fnc <- function(i){
  ## generate data ##
  x_base <- runif(n)

  ## cdf x ##
  cdf_x <- sort(x_base)

  ## No hazard rate ratio adjustment ##
  HRR_fun  <- stats::approxfun(c(0,1), c(1,1))
  HRR      <- HRR_fun(cdf_x)

  ## detect cluster ##
  clst <- AnomDetct::hypo_test(cdf = cdf_x, HRR = HRR, critical_cnt = min_crit,
                               window_l = n_clst, theta = theta_th)

  if(nrow(clst)==0) return(matrix(nrow = 0, ncol = 2,
                                  dimnames = list(NULL,
                                                  c("start","end"))))

  ## change clst from index to real range ##
  clst <- t(apply(clst,1,function(i)cdf_x[i]))

  colnames(clst) <- c("start","end")
  return(clst)
}
## scan results ##
case1_res <- mclapply(seq.int(sim_rep), case1_fnc)
## false positive ##
FP_clst <- sapply(case1_res,function(x)nrow(x)>0)
FP      <- sum(FP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case1_mtx <- do.call(rbind, case1_res)
cnt_fnc <- function(x) sum(case1_mtx[,1] <= x & case1_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.01)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step")

## plot density function ##
den_fnc <- function(x){
  base_den <- 1
  return(base_den)
}

## plot histogram ##
x_base <- POT::rgpd(n-n_clst, loc = mu, scale = sig, shape = xi)
x_clst <- qgpd(runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin),
               loc = mu, scale = sig, shape = xi)
x_all <- sort(c(x_clst,x_base))

p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2)

print(paste("False positive is",FP, sep = " "))
p_clst
p_den
p_hist

Case2

Cluster

case2_res <- readRDS("HRR_simulation_files/RDS/case2_sm_c_res.rds")

case2_fnc_clst generates data with fourier first order density function and mix it with cluster data. The function returns a matrix of 3 columns: start,end,if detected cluster intersects true cluster

case2_fnc_clst <- function(i){
  ## generate data ##
  init <- runif(n-n_clst)
  op <- function(inp){
    FCN <- function(x) abs(F_x(x) - inp)
    return(optim(par = 0.5, lower = 0, upper = 1,
                 fn = FCN, method = "L-BFGS-B")$par)
  } 
  y <- unlist(mclapply(init, op))
  ## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
  x_base <- ifelse(y == 0 | y == 1, init, y)
  x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

  ## cdf x ##
  cdf_x <- sort(c(x_base,x_clst))

  ## No hazard rate ratio adjustment ##
  HRR_fun  <- stats::approxfun(c(0,1), c(1,1))
  HRR      <- HRR_fun(cdf_x)

  ## detect cluster ##
  clst <- AnomDetct::hypo_test(cdf = cdf_x, HRR = HRR, critical_cnt = min_crit,
                               window_l = n_clst, theta = theta_th)

  if(nrow(clst)==0) return(matrix(nrow = 0, ncol = 3,
                                  dimnames = list(NULL,
                                                  c("start","end","clst_is_ture"))))
  clst_is_ture <- mapply(check_clst, 
                         split(clst,seq.int(nrow(clst))),
                         MoreArgs = list(x_clst = x_clst,
                                         x_all  = cdf_x))
  ## change clst from index to real range ##
  clst <- t(apply(clst,1,function(i)cdf_x[i]))

  colnames(clst) <- c("start","end")
  return(cbind(clst,clst_is_ture))
}
## seed ##
RNGkind("L'Ecuyer-CMRG")
set.seed(201911122)

## scan results ##
case2_res <- mclapply(seq.int(sim_rep), case2_fnc_clst)
## true positive ##
TP_clst <- sapply(case2_res, function(x) sum(x[,3])>0)
TP      <- sum(TP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case2_mtx <- do.call(rbind, case2_res)
cnt_fnc <- function(x) sum(case2_mtx[,1] <= x & case2_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot density function ##
den_fnc <- function(x){
  base_den <- (1-n_clst/n)*f_x(x)
  clst_den <- n_clst/n*(clst_loc-clst_bin<=x && x <=clst_loc+clst_bin)/
    (2*clst_bin)
  return(base_den + clst_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  ggtitle("Density Function") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
init <- runif(n-n_clst)
op <- function(inp){
  FCN <- function(x) abs(F_x(x) - inp)
  return(optim(par = 0.5, lower = 0, upper = 1,
               fn = FCN, method = "L-BFGS-B")$par)
}
y <- unlist(mclapply(init, op))
## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
x_base <- ifelse(y == 0 | y == 1, init, y)
x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)
p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2) + 
  ggtitle("Example of gp+cluster") +
  theme(plot.title = element_text(hjust = 0.5))

print(paste("True positive is",TP, sep = " "))
p_clst
p_den
p_hist

No Cluster

case2_res <- readRDS("HRR_simulation_files/RDS/case2_sm_nc_res.rds")
case2_fnc <- function(i){
  ## generate data ##
  init <- runif(n)
  op <- function(inp){
    FCN <- function(x) abs(F_x(x) - inp)
    return(optim(par = 0.5, lower = 0, upper = 1,
                 fn = FCN, method = "L-BFGS-B")$par)
  } 
  y <- unlist(mclapply(init, op))
  ## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
  x_base <- ifelse(y == 0 | y == 1, init, y)

  ## cdf x ##
  cdf_x <- sort(x_base)

  ## No hazard rate ratio adjustment ##
  HRR_fun  <- stats::approxfun(c(0,1), c(1,1))
  HRR      <- HRR_fun(cdf_x)

  ## detect cluster ##
  clst <- AnomDetct::hypo_test(cdf = cdf_x, HRR = HRR, critical_cnt = min_crit,
                               window_l = n_clst, theta = theta_th)

  if(nrow(clst)==0) return(matrix(nrow = 0, ncol = 2,
                                  dimnames = list(NULL,
                                                  c("start","end"))))

  ## change clst from index to real range ##
  clst <- t(apply(clst,1,function(i)cdf_x[i]))

  colnames(clst) <- c("start","end")
  return(clst)
}
## scan results ##
case2_res <- mclapply(seq.int(sim_rep), case2_fnc)
## false positive ##
FP_clst <- sapply(case2_res,function(x)nrow(x)>0)
FP      <- sum(FP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case2_mtx <- do.call(rbind, case2_res)
cnt_fnc <- function(x) sum(case2_mtx[,1] <= x & case2_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step") + 
  scale_y_continuous(limits = c(0,sim_rep))

## plot density function ##
den_fnc <- function(x){
  base_den <- f_x(x)
  return(base_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
x_base <- POT::rgpd(n-n_clst, loc = mu, scale = sig, shape = xi)
x_clst <- qgpd(runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin),
               loc = mu, scale = sig, shape = xi)
x_all <- sort(c(x_clst,x_base))

p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2)

print(paste("False positive is",FP, sep = " "))
p_clst
p_den
p_hist

Case3

Cluster

case3_res <- readRDS("HRR_simulation_files/RDS/case3_sm_c_res.rds")

case3_fnc_clst generates data with fourier first order density function and mix it with cluster data. The function returns a matrix of 3 columns: start,end,if detected cluster intersects true cluster

case3_fnc_clst <- function(i){
  ## generate data ##
  init <- runif(n-n_clst)
  op <- function(inp){
    FCN <- function(x) abs(F_x(x) - inp)
    return(optim(par = 0.5, lower = 0, upper = 1,
                 fn = FCN, method = "L-BFGS-B")$par)
  } 
  y <- unlist(mclapply(init, op))
  ## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
  x_base <- ifelse(y == 0 | y == 1, init, y)
  x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

  ## cdf x ##
  cdf_x <- sort(c(x_base,x_clst))

  ## hazard rate ratio adjustment ##
  HRR_fun <- function(x) 1/(1-x)/(f_x(x)/(1-F_x(x)))
  HRR <- HRR_fun(cdf_x)

  ## detect cluster ##
  clst <- AnomDetct::hypo_test(cdf = cdf_x, HRR = HRR, critical_cnt = min_crit,
                               window_l = n_clst, theta = theta_th)

  if(nrow(clst)==0) return(matrix(nrow = 0, ncol = 3,
                                  dimnames = list(NULL,
                                                  c("start","end","clst_is_ture"))))
  clst_is_ture <- mapply(check_clst, 
                         split(clst,seq.int(nrow(clst))),
                         MoreArgs = list(x_clst = x_clst,
                                         x_all  = cdf_x))
  ## change clst from index to real range ##
  clst <- t(apply(clst,1,function(i)cdf_x[i]))

  colnames(clst) <- c("start","end")
  return(cbind(clst,clst_is_ture))
}
## seed ##
RNGkind("L'Ecuyer-CMRG")
set.seed(201911123)

## scan results ##
case3_res <- mclapply(seq.int(sim_rep), case3_fnc_clst)
## true positive ##
TP_clst <- sapply(case3_res, function(x) sum(x[,3])>0)
TP      <- sum(TP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case3_mtx <- do.call(rbind, case3_res)
cnt_fnc <- function(x) sum(case3_mtx[,1] <= x & case3_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot density function ##
den_fnc <- function(x){
  base_den <- (1-n_clst/n)*f_x(x)
  clst_den <- n_clst/n*(clst_loc-clst_bin<=x && x <=clst_loc+clst_bin)/
    (2*clst_bin)
  return(base_den + clst_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  ggtitle("Density Function") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
init <- runif(n-n_clst)
op <- function(inp){
  FCN <- function(x) abs(F_x(x) - inp)
  return(optim(par = 0.5, lower = 0, upper = 1,
               fn = FCN, method = "L-BFGS-B")$par)
}
y <- unlist(mclapply(init, op))
## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
x_base <- ifelse(y == 0 | y == 1, init, y)
x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)
p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2)

print(paste("True positive is",TP, sep = " "))
p_clst
p_den
p_hist

No Cluster

case3_res <- readRDS("HRR_simulation_files/RDS/case3_sm_nc_res.rds")
case3_fnc <- function(i){
  ## generate data ##
  init <- runif(n)
  op <- function(inp){
    FCN <- function(x) abs(F_x(x) - inp)
    return(optim(par = 0.5, lower = 0, upper = 1,
                 fn = FCN, method = "L-BFGS-B")$par)
  } 
  y <- unlist(mclapply(init, op))
  ## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
  x_base <- ifelse(y == 0 | y == 1, init, y)

  ## cdf x ##
  cdf_x <- sort(x_base)

  ## hazard rate ratio adjustment ##
  HRR_fun <- function(x) 1/(1-x)/(f_x(x)/(1-F_x(x)))
  HRR <- HRR_fun(cdf_x)

  ## detect cluster ##
  clst <- AnomDetct::hypo_test(cdf = cdf_x, HRR = HRR, critical_cnt = min_crit,
                               window_l = n_clst, theta = theta_th)

  if(nrow(clst)==0) return(matrix(nrow = 0, ncol = 2,
                                  dimnames = list(NULL,
                                                  c("start","end"))))

  ## change clst from index to real range ##
  clst <- t(apply(clst,1,function(i)cdf_x[i]))

  colnames(clst) <- c("start","end")
  return(clst)
}
## scan results ##
system.time({case3_res <- mclapply(seq.int(sim_rep), case3_fnc)})
## false positive ##
FP_clst <- sapply(case3_res,function(x)nrow(x)>0)
FP      <- sum(FP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case3_mtx <- do.call(rbind, case3_res)
cnt_fnc <- function(x) sum(case3_mtx[,1] <= x & case3_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step")

## plot density function ##
den_fnc <- function(x){
  base_den <- f_x(x)
  return(base_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
init <- runif(n-n_clst)
op <- function(inp){
  FCN <- function(x) abs(F_x(x) - inp)
  return(optim(par = 0.5, lower = 0, upper = 1,
               fn = FCN, method = "L-BFGS-B")$par)
}
y <- unlist(mclapply(init, op))
## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
x_base <- ifelse(y == 0 | y == 1, init, y)
x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)
p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2) + 
  ggtitle("Example of gp+cluster") +
  theme(plot.title = element_text(hjust = 0.5))

print(paste("False positive is",FP, sep = " "))
p_clst
p_den
p_hist

Notes

Insert cluster on GP or uniform

If insert same cluster on GP density function for case1 and case2, the clusters occur on the place where surrending density are different for correpsonding uniform density function. If insert same clsuter on uniform density function, the clusters occur on the place where surrending density are different for correpsonding gp density function. Considering the method scans on uniform density function, the clusters are inserted on uniform density function.

Take a simple example for inserting on a fixed place clst_loc, on uniform.

Case1:

## generate data ##
x_base <- runif(n-n_clst)
x_clst <- rep(clst_loc,n_clst)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

ggplot(data.frame(x = cdf_x), mapping = aes(x)) + 
  geom_histogram(binwidth = 0.01)

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)

ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2)

Case2:

## generate data ##
init <- runif(n-n_clst)
op <- function(inp){
  FCN <- function(x) abs(F_x(x) - inp)
  return(optim(par = 0.5, lower = 0, upper = 1,
               fn = FCN, method = "L-BFGS-B")$par)
}
y <- unlist(mclapply(init, op))
## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
x_base <- ifelse(y == 0 | y == 1, init, y)
x_clst <- rep(clst_loc,n_clst)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

ggplot(data.frame(x = cdf_x), mapping = aes(x)) + 
  geom_histogram(binwidth = 0.01)

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)

ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2)

scan direction affects results

case2 revise:

## cos parameter ##
a <- 0.7
## seed ##
RNGkind("L'Ecuyer-CMRG")
set.seed(201911122)

## scan results ##
case2_rev_res <- mclapply(seq.int(sim_rep), case2_fnc)
case2_rev_res <- readRDS("HRR_simulation_files/RDS/case2_sm_rev_res.rds")
## true positive ##
TP_clst <- sapply(case2_rev_res, function(x) sum(x[,3])>0)
TP      <- sum(TP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case2_rev_mtx <- do.call(rbind, case2_rev_res)
cnt_fnc <- function(x) sum(case2_rev_mtx[,1] <= x & case2_rev_mtx[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot density function ##
den_fnc <- function(x){
  base_den <- (1-n_clst/n)*f_x(x)
  clst_den <- n_clst/n*(clst_loc-clst_bin<=x && x <=clst_loc+clst_bin)/
    (2*clst_bin)
  return(base_den + clst_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  ggtitle("Density Function") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
init <- runif(n-n_clst)
op <- function(inp){
  FCN <- function(x) abs(F_x(x) - inp)
  return(optim(par = 0.5, lower = 0, upper = 1,
               fn = FCN, method = "L-BFGS-B")$par)
}
y <- unlist(mclapply(init, op))
## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
x_base <- ifelse(y == 0 | y == 1, init, y)
x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)
p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2) + 
  ggtitle("Example of gp+cluster") +
  theme(plot.title = element_text(hjust = 0.5))

print(paste("True positive is",TP, sep = " "))
p_clst
p_den
p_hist

Extra-large Sample of Case3 with cluster

case3_xl_res <- readRDS("HRR_simulation_files/RDS/case3_c_res.rds")
n <- 30000L
a <- -0.7

#------------------------------ critical values -------------------------------#
## As the size are same cross different cases,
## critical values only need to be computed once.
min_crit <- AnomDetct::crit_theta_fun(N = n, window_l = n_clst, 
                                      p = 1-exp(-theta_th), alpha = alpha_0)

## check if prob_fun used in crit_theta_fun is reliable ##
if(!check_fun(N = n, critical_cnt = min_crit, 
              window_l = n_clst, p = 1-exp(-theta_th))) stop("Invalid input")


## alpha_th may not be same as alpha_0 as min_crit is discrete ##
alpha_th <- AnomDetct::prob_fun(N = n, k = min_crit, 
                                m = n_clst, p = 1-exp(-theta_th))
## seed ##
RNGkind("L'Ecuyer-CMRG")
set.seed(201911121)

## scan results ##
system.time({case3_xl_res <- mclapply(seq.int(sim_rep), case3_fnc_clst)})
## false positive ##
FP_clst <- sapply(case3_xl_res,function(x)nrow(x)>0)
FP      <- sum(FP_clst)/sim_rep

#------------------------------------ plot ------------------------------------#
## cumulative detected clusters ##
case3_xl_res <- do.call(rbind, case3_xl_res)
cnt_fnc <- function(x) sum(case3_xl_res[,1] <= x & case3_xl_res[,2] >= x)

p_clst <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,cnt_fnc), geom = "step")

## plot density function ##
den_fnc <- function(x){
  base_den <- f_x(x)
  return(base_den)
}

p_den <- ggplot(data.frame(x = seq(0,1,by = 0.001)),mapping = aes(x)) + 
  stat_function(fun = function(x)sapply(x,den_fnc), geom = "step") +
  scale_y_continuous(limits = c(0,NA))

## plot histogram ##
init <- runif(n-n_clst)
op <- function(inp){
  FCN <- function(x) abs(F_x(x) - inp)
  return(optim(par = 0.5, lower = 0, upper = 1,
               fn = FCN, method = "L-BFGS-B")$par)
}
y <- unlist(mclapply(init, op))
## if y is 0 or 1, replace y with init because sin 0 and 2pi is 1
x_base <- ifelse(y == 0 | y == 1, init, y)
x_clst <- runif(n_clst, clst_loc-clst_bin, clst_loc+clst_bin)

## cdf x ##
cdf_x <- sort(c(x_base,x_clst))

## x_all ##
x_all <- POT::qgpd(cdf_x, loc = mu, scale = sig, shape = xi)
p_hist <- ggplot(data.frame(x = x_all[x_all<100]), mapping = aes(x)) + 
  geom_histogram(binwidth = 2) + 
  ggtitle("Example of gp+cluster") +
  theme(plot.title = element_text(hjust = 0.5))

print(paste("False positive is",FP, sep = " "))
p_clst
p_den
p_hist

Geometric explanation

a <- 0.7

df1 <- data.frame(x = seq(0,1,by = 0.001),
                  y = F_x(seq(0,1,by = 0.001)),
                  type = "CDF")

df2 <- data.frame(x = c(0.25,1),
                  y = c(F_x(0.25),1),
                  type = "expt slope")

linear <- function(x) F_x(0.25)-f_x(0.25)*0.25 + f_x(0.25)*x
df3 <- data.frame(x = seq(0,0.5,by = 0.001),
                  y = linear(seq(0,0.5,by = 0.001)),
                  type = "true slope")

df <- rbind(df1,df2,df3)

pt <- data.frame(x = c(0.25,1),
                 y = c(F_x(0.25),1))
pt$coords <- c("(x, y)", "(1, 1)")

p <- ggplot(df) + 
  geom_path(aes(x = x, y = y, color = type, linetype = type)) +
  geom_point(data = pt, aes(x = x, y = y),size = 3, shape=1) + 
  geom_label(data = pt, aes(x = x, y = y+0.1, label=coords)) + 
  ggtitle("CDF") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.key.width=unit(0.5,"cm")) + 
  scale_x_continuous(limits = c(0,1.3))

p

When blue line and green line are overlapped, that means the the expected slope is same as true slop, which means $f(x) = \frac{1-F(x)}{1-x}$, where f(x) is the pdf of x and F(x) is the cdf of x. This can be rewritten as $H_v(x) = \frac{f(x)}{1-F(x)} = \frac{1}{1-x} = H_u(x)$ So only when $H_v(x) = H_u(x)$, the probability is correct without Hazard rate adjustment.

When blue line and green line are not overlapped, as the point $(x,y)$ goes up by one unit,say $\Delta y$, the true increament on x-axis is $\frac{1}{f(x)}\Delta y$ while the expected increament on x-axis is $\frac{1-x}{1-F(x)}\Delta y$.

$\frac{1}{f(x)}\Delta y = \frac{H_u(x)}{H_v(x)}\frac{1-x}{1-F(x)}\Delta y$

So when $H_v(x) != H_u(x)$, we need to adjust the expected increament (distance) by $\frac{H_u(x)}{H_v(x)}$



zhicongz/AnomDetct documentation built on Dec. 12, 2019, 9:16 a.m.