R/RunInvBag.R

#' @title Inverse Bagging algorithm
#' @description
#' The package is an R implementation of the Inverse Bagging algorithm. 
#' Given two sets: a training one containing observations from a certain distribution 
#' and a testing one which can include as well anomalous observations generated 
#' from a different unknown distribution. 
#' The algorithm returns scores for the testing set observations relating to their anomalous evidence.
#'
#' @param train data frame - Data generated by a certain distribution (so-called background). 
#' @param test data frame - Data mostly generated by the background with possible signal observations.
#' @param Q numeric - Size of sampled sets, which cannot be larger than the test data size
#' @param Mboot numeric - Number of sampling iterations.
#' @param R numeric - Method for scores computation:
#' \itemize{
#' \item 1 - Test statistic score
#' \item 2 - P-value score
#' \item 3 - OK scores
#' } 
#' @return A numeric vector of scores for respective observations from the test data set.
#' @references Vischia, P. and Tommaso D., "The Inverse Bagging Algorithm: Anomaly Detection by Inverse Bootstrap Aggregating." EPJ Web of Conferences. Vol. 137. EDP Sciences, 2017.
#' @examples 
#' #Generate multivariate data
#'require(mvtnorm)
#' set.seed(1)
#'n <- 2000
#'B <- 0.05
#'P <- 4
#'train <- rmvnorm(n, sigma = diag(P))
#'
#'#add anomalies with mean shited from sero
#'test <- rbind(
#'  rmvnorm(n*(1-B), sigma = diag(P)), 
#'  rmvnorm(n*B, mean = rep(2,P),sigma = diag(P))
#')
#'
#'#Compute the score
#'results <- RunInvBag(train, test)
#'
#'#Plot results
#'plot(rersults)
#' @export
RunInvBag <- function(train, test, Q=NA, Mboot2=10000, R=1){
#  ibag <- mclapply(c(4,8,12,19), function(N_rand_forest){
  
  P <- dim(test)[2]
  n <- dim(test)[1]
  if(is.na(Q)) Q <- n/10 else if (Q>n) return("Q is larger than test data size")
  
  Sigma <- cov(train)
  inv_cov <- solve(Sigma)
  
  IB <- matrix(0, ncol=n, nrow = 4)
  IB[4,] <- 1
  
  importance <- rep(0,P)
  counts<- rep(0,P)
  w <- rep(1000,n)
  w_feat <- rep(100,P)
  w_save <- w_feat
  
  for(iter in 1:(Mboot2)){
    set.seed(iter)
    sample <- sample(1:n, Q, replace = T)
    p <- classify(test, sample, Q, P, inv_cov)
    for(iter2 in 1:Q){
      IB[,sample[iter2]] <- IB[,sample[iter2]] + p
    }
  }
  if(R==1)return(IB[1,]/IB[4,]) else
    if(R==2)return(IB[2,]/IB[4,]) else 
      return(IB[3,]/IB[4,])
}

classify <- function(Y1, sample, n, p, inv_cov, vars){
  means <- as.matrix(apply(Y1[sample,],2,mean))
  T2 <- t(means)%*%inv_cov%*%means*n*(n-p)/(n-1)/p
  #T2 <- t(means)%*%diag(rep(1,p))%*%means*n*(n-p)/(n-1)/p
  pval <- 1-pf(T2,p,n-p)
  c(T2, pval, as.numeric(pval<0.05),1)
}
Grzes91/InvBag documentation built on May 9, 2019, 2:19 a.m.