#' Predicts suitable outlier methods from meta-features of the dataset using trained random forest models.
#' @param ftrs Meta-features of the dataset.
#' @param models The trained random forest models. These are obtained from the function \code{TrainModels}.
#'
#' @return The prediction probabilities for each of the outlier method. For example, a probability of 0.78 for the first outlier method means the probability that the first method is good for this dataset is 0.78.
#'
#' @examples
#' \dontrun{
#' data(Arrhythmia_withoutdupl_05_v05)
#' dat <- Arrhythmia_withoutdupl_05_v05
#' feat <- ComputeMetaFeaturesMM(dat)
#' fit <- TrainModels(1,1,1)
#' out <- PredictPerformance(feat, fit)
#' }
#'
#' @export
PredictPerformance <- function(ftrs, models){
ftrs <- as.data.frame(ftrs)
nn <- dim(ftrs)[1]
d <- models$d
p <- models$p
s <- models$s
col_names <- models$col_names
x <- ftrs[,col_names]
if(nn==1){
# only one record
if(d==1){
preds <- matrix(0,nrow=nn, ncol=12)
colnames(preds) <- c("COF", "FAST_ABOD", "INFLO", "KDEOS", "KNN", "KNNW", "LDF", "LDOF", "LOF", "LOOP", "ODIN", "SIMLOF")
preds[,1] <- predict(models$cof,newdata=x, type="prob")[2]
preds[,2]<- predict(models$fabod,newdata=x, type="prob")[2]
preds[,3] <- predict(models$inflo,newdata=x, type="prob")[2]
preds[,4] <- predict(models$kdeos,newdata=x, type="prob")[2]
preds[,5] <- predict(models$knn,newdata=x, type="prob")[2]
preds[,6] <- predict(models$knnw,newdata=x, type="prob")[2]
preds[,7] <- predict(models$ldf,newdata=x, type="prob")[2]
preds[,8] <- predict(models$ldof,newdata=x, type="prob")[2]
preds[,9]<- predict(models$lof,newdata=x, type="prob")[2]
preds[,10] <- predict(models$loop,newdata=x, type="prob")[2]
preds[,11] <- predict(models$odin,newdata=x, type="prob")[2]
preds[,12] <- predict(models$simlof,newdata=x, type="prob")[2]
}else{
# d=2
preds <- matrix(0,nrow=nn, ncol=8)
colnames(preds) <- c("Ensemble_Median_IQR", "LOF_Min_Max", "KNN_Median_IQR", "FAST_ABOD_Min_Max", "iForest_Median_IQR", "KDEOS_Median_IQR", "KDEOS_Min_Max", "LDF_Min_Max")
preds[,1] <- predict(models$mod_ens_med_iqr,newdata=x, type="prob")[2]
preds[,2]<- predict(models$mod_lof_min_max,newdata=x, type="prob")[2]
preds[,3] <- predict(models$mod_knn_med_iqr,newdata=x, type="prob")[2]
preds[,4] <- predict(models$mod_fabod_min_max,newdata=x, type="prob")[2]
preds[,5] <- predict(models$mod_iforest_med_iqr,newdata=x, type="prob")[2]
preds[,6] <- predict(models$mod_kdeos_med_iqr,newdata=x, type="prob")[2]
preds[,7] <- predict(models$mod_kdeos_min_max,newdata=x, type="prob")[2]
preds[,8] <- predict(models$mod_ldf_min_max,newdata=x, type="prob")[2]
}
}else{
if(d==1){
preds <- matrix(0,nrow=nn, ncol=12)
colnames(preds) <- c("COF", "FAST_ABOD", "INFLO", "KDEOS", "KNN", "KNNW", "LDF", "LDOF", "LOF", "LOOP", "ODIN", "SIMLOF")
preds[,1] <- predict(models$cof,newdata=x, type="prob")[ ,2]
preds[,2]<- predict(models$fabod,newdata=x, type="prob")[ ,2]
preds[,3] <- predict(models$inflo,newdata=x, type="prob")[ ,2]
preds[,4] <- predict(models$kdeos,newdata=x, type="prob")[ ,2]
preds[,5] <- predict(models$knn,newdata=x, type="prob")[ ,2]
preds[,6] <- predict(models$knnw,newdata=x, type="prob")[ ,2]
preds[,7] <- predict(models$ldf,newdata=x, type="prob")[ ,2]
preds[,8] <- predict(models$ldof,newdata=x, type="prob")[ ,2]
preds[,9]<- predict(models$lof,newdata=x, type="prob")[ ,2]
preds[,10] <- predict(models$loop,newdata=x, type="prob")[ ,2]
preds[,11] <- predict(models$odin,newdata=x, type="prob")[ ,2]
preds[,12] <- predict(models$simlof,newdata=x, type="prob")[ ,2]
}else{
# d=2
preds <- matrix(0,nrow=nn, ncol=8)
colnames(preds) <- c("Ensemble_Median_IQR", "LOF_Min_Max", "KNN_Median_IQR", "FAST_ABOD_Min_Max", "iForest_Median_IQR", "KDEOS_Median_IQR", "KDEOS_Min_Max", "LDF_Min_Max")
preds[,1] <- predict(models$mod_ens_med_iqr,newdata=x, type="prob")[ ,2]
preds[,2]<- predict(models$mod_lof_min_max,newdata=x, type="prob")[ ,2]
preds[,3] <- predict(models$mod_knn_med_iqr,newdata=x, type="prob")[ ,2]
preds[,4] <- predict(models$mod_fabod_min_max,newdata=x, type="prob")[ ,2]
preds[,5] <- predict(models$mod_iforest_med_iqr,newdata=x, type="prob")[ ,2]
preds[,6] <- predict(models$mod_kdeos_med_iqr,newdata=x, type="prob")[ ,2]
preds[,7] <- predict(models$mod_kdeos_min_max,newdata=x, type="prob")[ ,2]
preds[,8] <- predict(models$mod_ldf_min_max,newdata=x, type="prob")[ ,2]
}
}
return(preds)
}
#' Train models to predict outlier methods from meta-features of datasets.
#'
#' @param d If \code{d=1} then we take the Min_Max performance values, if \code{d=2} then performance values from all normalization methods are considered. Input values for \code{d} are only \code{1,2}.
#' @param p If \code{p=1} then we take binary values based on absolute performance, i.e. if performance \code{ > 0.8}, if \code{p=2} the relative binary performance values are used. Input values for \code{p} are only \code{1,2}.
#' @param rocpr For all normalization methods, if \code{rocpr=1} then area under \code{ROC} curve is used, if \code{rocpr=2} then area under \code{PR} curve is used. For Min_Max method, i.e. if \code{d=1}, then only \code{ROC} values are used.
#' @param s If \code{s=1} then we train the models on a preferred subset of features. If \code{s=2} the models are trained on all features, which takes considerably longer. Default value \code{s=1}. Input values for \code{s} are only \code{1,2}.
#'
#' @return The trained randomforest models.
#'
#' @examples
#' \dontrun{
#' fit <- TrainModels(1,1,1,1)
#' }
#'
#' @export
#'
TrainModels <- function(d, p, rocpr=1, s=1){
# d is for meta-data set
# d = 1 is the min_max performance data set
# d = 2 is for all normalization methods performance dataset
# p = 1 for absolute performance, i.e. if perf > 0.8 then 1 else 0
# p = 2 for relative performance with 0.05, i.e. 1 for |perf - max_perf | < 0.05, 0 otherwise
# s is for subset of features or all features
# s = 1 is for a subset of features
# s = 2 is for all features
# rocpr is for roc or pr. This is only for all norm methods
# rocpr = 1 for roc values
# rocpr = 2 for pr values
print("Training models on meta-features. This will take some time.")
if((d!=1)&(d!=2)){
stop("Invalid d. d should equal 1 or 2.")
}
if((p!=1)&(p!=2)){
stop("Invalid p. p should equal 1 or 2.")
}
if((s!=1)&(s!=2)){
stop("Invalid s. s should equal 1 or 2.")
}
if((rocpr!=1)&(rocpr!=2)){
stop("Invalid rocpr. rocpr should equal 1 or 2.")
}
e <- new.env()
if(d==1){
# ---- ONLY MIN_MAX NORMALIZATION METHOD
data(features_mm, envir = e)
if(s==1){
col_list <- c('OPO_Res_ResOut_Median_1', 'OPO_Den_Out_95P_1', 'Mean_Entropy_Attr', 'OPO_Res_Out_Mean_1', 'OPO_GDeg_PO_Mean_1', 'IQR_TO_SD_95', 'OPO_GDeg_Out_Mean_1')
col_nums <- which(colnames(features_mm) %in% col_list )
ftr_subset <- features_mm[ ,col_nums]
}else if (s==2){
# all features
col_nums <- 1:dim(features_mm)[2]
ftr_subset <- features_mm
}
if(p==1){
# absolute performance 1 or 0
data(abs_perfs_mm, envir = e)
perfs <- abs_perfs_mm
}else if(p==2){
# relative performance 1 or 0
data(rel_perfs_0.05_mm, envir = e)
perfs <- rel_perfs_0.05_mm
}
}else if(d==2){
# ---- ALL NORMALIZATION METHODS - PERFORMANCE AND FEATURES
if(rocpr==1){
# ROC values
data(features_all, envir = e)
data(perf_vals_roc_subset, envir = e)
perf_vals <- perf_vals_roc_subset
feat <- features_all
col_list <- c('OPO_Res_ResOut_Mean_3', 'OPO_GDeg_Out_Mean_3', 'OPO_GComp_PO_Mean_1', 'MEAN_PROD_IQR', 'OPO_Res_Out_Mean_3', 'OPO_Res_ResOut_Max_3', 'OPO_Out_LocDenOut_1_3')
}else{
# PR values
stop("This functionality will be added in the near future.")
data(features_all_pr, envir = e)
data(perf_vals_pr_subset, envir = e)
perf_vals <- perf_vals_pr_subset
feat <- features_all_pr
# UPDATE COL_LIST
col_list <- c(0)
}
if(s==1){
col_nums <- which(colnames(feat) %in% col_list )
ftr_subset <- feat[ ,col_nums]
}else if (s==2){
col_nums <- 1:dim(feat)[2]
ftr_subset <- feat
}
if(p==1){
# absolute performance 1 or 0
abs_perf_1_0 <- apply(perf_vals, 2, function(x) ifelse(x>=0.8,1,0))
perfs <- abs_perf_1_0
}else if(p==2){
# relative performance 1 or 0
relative_perf_eps_0.05 <- EpsilonGood(perf_vals, 0.05)
perfs <- relative_perf_eps_0.05
}
}
if(d==1){
# Train models
mod_cof <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,1]) )
mod_fabod <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,2]) )
mod_inflo <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,3]) )
mod_kdeos <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,4]) )
mod_knn <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,5]) )
mod_knnw <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,6]) )
mod_ldf <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,7]) )
mod_ldof <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,8]) )
mod_lof <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,9]) )
mod_loop <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,10]) )
mod_odin <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,11]) )
mod_simlof <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,12]) )
models <- list()
models$cof <- mod_cof
models$fabod <- mod_fabod
models$inflo <- mod_inflo
models$kdeos <- mod_kdeos
models$knn <- mod_knn
models$knnw <- mod_knnw
models$ldf <- mod_ldf
models$ldof <- mod_ldof
models$lof <- mod_lof
models$loop <- mod_loop
models$odin <- mod_odin
models$simlof <- mod_simlof
}else{
# d =2
mod_ens_med_iqr <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,1]) )
mod_lof_min_max <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,2]) )
mod_knn_med_iqr <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,3]) )
mod_fabod_min_max <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,4]) )
mod_iforest_med_iqr <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,5]) )
mod_kdeos_med_iqr <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,6]) )
mod_kdeos_min_max <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,7]) )
mod_ldf_min_max <- randomForest::randomForest(ftr_subset, as.factor(perfs[ ,8]) )
models <- list()
models$mod_ens_med_iqr <- mod_ens_med_iqr
models$mod_lof_min_max <- mod_lof_min_max
models$mod_knn_med_iqr <- mod_knn_med_iqr
models$mod_fabod_min_max <- mod_fabod_min_max
models$mod_iforest_med_iqr <- mod_iforest_med_iqr
models$mod_kdeos_med_iqr <- mod_kdeos_med_iqr
models$mod_kdeos_min_max <- mod_kdeos_min_max
models$mod_ldf_min_max <- mod_ldf_min_max
}
models$cols <- col_nums
models$d <- d
models$p <- p
models$s <- s
models$col_names <- col_list
models$rocpr <- rocpr
return(models)
}
#' Cross validates models to reproduce results in journal and conference papers.
#'
#' @inheritParams TrainModels
#' @param n The number of folds in cross validation.
#'
#' @return A list containing the following results from the cross validated models:
#' \describe{
#' \item{def_acc}{The default accuracy we get if we predict the method is not good for all instances. This is the percentage of the majority class.}
#' \item{results}{The \code{n}-fold cross valdation results. }
#' \item{mean_acc}{The mean \code{n}-fold cross valdation results.}
#' }
#'
#' @examples
#' \dontrun{
#' out <- CrossValidateModels(1,1,1,1,5)
#' out$mean_acc
#' }
#'
#' @export
CrossValidateModels <- function(d, p, rocpr=1, s=1, n=5){
# d is for meta-data set
# d = 1 is the min_max performance data set
# d = 2 is for all normalization methods performance dataset
# p = 1 for absolute performance, i.e. if perf > 0.8 then 1 else 0
# p = 2 for relative performance with 0.05, i.e. 1 for |perf - max_perf | < 0.05, 0 otherwise
# s is for subset of features or all features
# s = 1 is for a subset of features
# s = 2 is for all features
if((d!=1)&(d!=2)){
stop("Invalid d. d should equal 1 or 2.")
}
if((p!=1)&(p!=2)){
stop("Invalid p. p should equal 1 or 2.")
}
if((s!=1)&(s!=2)){
stop("Invalid s. s should equal 1 or 2.")
}
if((rocpr!=1)&(rocpr!=2)){
stop("Invalid rocpr. rocpr should equal 1 or 2.")
}
if(n > 10){
stop("Consider n less than or equal to 10.")
}
e <- new.env()
if(d==1){
# ---- ONLY MIN_MAX NORMALIZATION METHOD
data(features_mm, envir=e)
filenames <- features_mm$filename
data("perf_vals_mm", envir=e)
perf_best <- apply(perf_vals_mm, 1, which.max)
if(s==1){
col_list <- c('OPO_Res_ResOut_Median_1', 'OPO_Den_Out_95P_1', 'Mean_Entropy_Attr', 'OPO_Res_Out_Mean_1', 'OPO_GDeg_PO_Mean_1', 'IQR_TO_SD_95', 'OPO_GDeg_Out_Mean_1')
col_nums <- which(colnames(features_mm) %in% col_list )
ftr_subset <- features_mm[ ,col_nums]
}else if (s==2){
# all features
col_nums <- 1:dim(features_mm)[2]
ftr_subset1 <- features_mm
sds <- apply(ftr_subset1, 2, sd)
rm_cols <- c(1, which(sds==0)) # to remove the filename
col_nums <- setdiff(col_nums,rm_cols)
ftr_subset <- features_mm[ ,col_nums]
cat("Using all", length(col_nums), "features. This will take REALLY LONG! \n")
}
if(p==1){
# absolute performance 1 or 0
data(abs_perfs_mm, envir=e)
perfs <- abs_perfs_mm
}else if(p==2){
# relative performance 1 or 0
data(rel_perfs_0.05_mm, envir=e)
perfs <- rel_perfs_0.05_mm
}
}else if(d==2){
# ---- ALL NORMALIZATION METHODS - PERFORMANCE AND FEATURES
if(rocpr==1){
# ROC values are used
data(features_all, envir=e)
filenames <- features_all$filename
feat <- features_all
col_list <- c('OPO_Res_ResOut_Mean_3', 'OPO_GDeg_Out_Mean_3', 'OPO_GComp_PO_Mean_1', 'MEAN_PROD_IQR', 'OPO_Res_Out_Mean_3', 'OPO_Res_ResOut_Max_3', 'OPO_Out_LocDenOut_1_3')
data("perf_vals_roc_subset", envir=e)
perf_vals <- perf_vals_roc_subset
}else{
# PR values are used
stop("This functionality will be added in the near future.")
data(features_all_pr, envir=e)
filenames <- features_all_pr$filename
feat <- features_all_pr
data("perf_vals_pr_subset", envir=e)
perf_vals <- perf_vals_pr_subset
# NEED TO ADD COL_LIST
col_list <- c()
}
perf_best <- apply(perf_vals, 1, which.max)
if(s==1){
col_nums <- which(colnames(features_all) %in% col_list )
ftr_subset <- feat[ ,col_nums]
}else if (s==2){
col_nums <- 1:dim(features_all)[2]
ftr_subset1 <- features_all
sds <- apply(ftr_subset1, 2, sd)
rm_cols <- c(1, which(sds==0)) # to remove the filename
col_nums <- setdiff(col_nums,rm_cols)
ftr_subset <- features_all[ ,-rm_cols]
cat("Using all", length(col_nums), "features. This will take REALLY LONG! \n")
}
if(p==1){
# absolute performance 1 or 0
abs_perf_1_0 <- apply(perf_vals, 2, function(x) ifelse(x>=0.8,1,0))
perfs <- abs_perf_1_0
}else if(p==2){
# relative performance 1 or 0
relative_perf_eps_0.05 <- EpsilonGood(perf_vals, 0.05)
perfs <- relative_perf_eps_0.05
}
}
# features are in ftr_subset
# performance values in perfs
result_table <- matrix(0, nrow=n, ncol=dim(perfs)[2])
pred_algs <- matrix(0, nrow=length(filenames), ncol=dim(perfs)[2])
colnames(pred_algs) <- colnames(perfs)
pred_best <- matrix(0, nrow=length(filenames), ncol=dim(perfs)[2])
colnames(pred_best) <- colnames(perfs)
# Cross validation on file source as many variants of the same file exist
file_source <-GetFileSources(filenames)
uniq_f_s <- unique(file_source)
# Create n equally size folds
set.seed(1234)
new_order <- sample(uniq_f_s,length(uniq_f_s))
folds <- cut(seq(1,length(uniq_f_s)),breaks=n,labels=FALSE)
cat("Starting", n, "fold cross validation. \n")
ftr_subset <- apply(ftr_subset,2,unitize_1)
# Perform n fold cross validation
for(i in 1:n){
testSources <- new_order[which(folds==i,arr.ind=TRUE)]
testIndices <- which(file_source %in% testSources)
# Segement your data by fold
testData <- ftr_subset[testIndices, ]
trainData <- ftr_subset[-testIndices, ]
testLabels <- perfs[testIndices, ]
trainLabels <- perfs[-testIndices, ]
# Train model for best method
testLabBest <- perf_best[testIndices]
trainLabBest <- perf_best[-testIndices]
modelBest <-randomForest::randomForest(trainData, as.factor(paste(trainLabBest)))
pred_best[testIndices,] <- predict(modelBest, testData, type="prob" )
for(j in 1:dim(perfs)[2]){
cat("Fold ", i, " Method " , j, "... \n")
model <- randomForest::randomForest(trainData, as.factor(trainLabels[ ,j]))
preds <- predict(model, testData, type="class")
pred_algs[testIndices,j] <- predict(model, testData, type="prob")[ ,2]
result_table[i,j] <- sum(preds==testLabels[ ,j])/length(testLabels[ ,j])
print(paste("Accuracy", result_table[i,j]) )
}
}
default_accuracy <- apply(perfs, 2, table)*100/dim(perfs)[1]
default_accuracy <- apply(default_accuracy,2, max)
out <- list()
out$def_acc <- default_accuracy
out$results <-result_table
out$mean_acc <- apply(result_table, 2, mean)
out$algo_preds <- pred_algs
out$pred_best <- pred_best
return(out)
}
#' Cross validates SVM with instance space coordinates.
#'
#'@inheritParams CrossValidateModels
#'
#'@return A list with the following components:
#'\describe{
#' \item{def_acc}{The default accuracy we get if we predict the method is not good for all instances. This is the percentage of the majority class.}
#' \item{results}{The \code{n}-fold cross valdation results. }
#' \item{mean_acc}{The mean \code{n}-fold cross valdation results.}
#' \item{d}{Which instance space is cross validated. If \code{d=1}, then it is MIN_MAX normalization instance space, if \code{d=2} then all normalization methods are used. }
#' \item{coordinates}{The instance space coordinates.}
#' }
#'
#'@examples
#'\dontrun{ out <- CrossValidateSVM(1,5) }
#'
#'@export
#'
CrossValidateSVM <- function(d=1,n=5){
if((d!=1)&(d!=2)){
stop("Invalid d. d should equal 1 or 2.")
}
if(n > 10){
stop("Consider n less than or equal to 10.")
}
e <- new.env()
if(d==1){
# coordinates for min-max normalization
data("dat_4_svm_mm", envir=e)
coordinates <- dat_4_svm_mm[, c(2,3)]
perfs <- dat_4_svm_mm[ ,4:15]
filenames <- dat_4_svm_mm[, 1]
cst <- 150
gmv <- 0.75
}else{
data("data_4_svm_mix_5K", envir=e)
coordinates <- data_4_svm_mix_5K[, c(2,3)]
perfs <- data_4_svm_mix_5K[ ,4:11]
filenames <- data_4_svm_mix_5K[, 1]
cst <- 50
gmv <- 0.25
}
algo_names <- colnames(perfs)
# Cross validation on file source as many variants of the same file exist
file_source <-GetFileSources(filenames)
uniq_f_s <- unique(file_source)
# Create n equally size folds
set.seed(1234)
new_order <- sample(uniq_f_s,length(uniq_f_s))
folds <- cut(seq(1,length(uniq_f_s)),breaks=n,labels=FALSE)
cat("Starting", n, "fold cross validation. \n")
result_table <- matrix(0, nrow=n, ncol=dim(perfs)[2])
# Perform n fold cross validation
for(i in 1:n){
testSources <- new_order[which(folds==i,arr.ind=TRUE)]
testIndices <- which(file_source %in% testSources)
# Segement your data by fold
testData <- coordinates[testIndices, ]
trainData <- coordinates[-testIndices, ]
testLabels <- perfs[testIndices, ]
trainLabels <- perfs[-testIndices, ]
for(j in 1:dim(perfs)[2]){
cat("Fold ", i, " Method " , j, "... \n")
model <- e1071::svm(trainData, as.factor(trainLabels[ ,j]), cost = cst, gamma=gmv)
preds <- predict(model, testData, type="class")
result_table[i,j] <- sum(preds==testLabels[ ,j])/length(testLabels[ ,j])
print(paste("Accuracy", result_table[i,j]) )
}
}
default_accuracy <- apply(perfs, 2, table)*100/dim(perfs)[1]
default_accuracy <- apply(default_accuracy,2, max)
out <- list()
out$def_acc <- default_accuracy
out$results <-result_table
out$mean_acc <- apply(result_table, 2, mean)*100
out$d <- d
out$coordinates <- coordinates
out$algonames <- algo_names
return(out)
}
#' Trains an SVM on instance space coordinates and draws the instance space.
#'
#' @inheritParams CrossValidateSVM
#' @param vis If \code{TRUE} then the instance space is plotted.
#'
#' @return A list with the following components:
#'\describe{
#' \item{preds10}{The predictions as \code{1} or \code{0} for each instance and each algorithm.}
#' \item{predprobs}{The prediction probabilities for each each instance and each algorithm.}
#' \item{algorithms}{The algorithm best suited for each instance. For some instances, no algorithms work well.}
#' \item{d}{Which instance space is cross validated. If \code{d=1}, then it is MIN_MAX normalization instance space, if \code{d=2} then all normalization methods are used. }
#' \item{coordinates}{The instance space coordinates.}
#' }
#'
#'@examples
#'\dontrun{ svmout <- InstSpace(d=1)}
#'
#'@export
#'
InstSpace <- function(d=1, vis=FALSE){
if((d!=1)&(d!=2)){
stop("Invalid d. d should equal 1 or 2.")
}
e <- new.env()
if(d==1){
# Instance space using MIN_MAX
data("dat_4_svm_mm", envir=e)
filenames <- dat_4_svm_mm[ ,1]
xx <- dat_4_svm_mm[, 2:3]
perfs <- dat_4_svm_mm[ , 4:15]
cst <- 150
gmv <- 0.75
num_methods <- dim(perfs)[2]
print("Training 12 SVMS for outlier detection methods. This will take some time.")
}else{
# Instance space using all normalization methods
data("data_4_svm_mix_5K", envir=e)
filenames <- data_4_svm_mix_5K[ ,1]
xx <- data_4_svm_mix_5K[, 2:3]
perfs <- data_4_svm_mix_5K[ , 4:11]
cst <- 50
num_methods <- dim(perfs)[2]
gmv <- 0.25 # 1/num_methods #
print("Training 8 SVMS for outlier detection methods. This will take some time.") }
# Train an SVM
preds.all <- matrix(0,nrow=nrow(xx), ncol=num_methods)
preds.all.1.0 <- matrix(0,nrow=nrow(xx), ncol=num_methods)
for(kk in 1:num_methods){
dd2 <- cbind.data.frame(xx, as.factor(perfs[,kk]))
colnames(dd2) <- c("x", "y","Label" )
# Class weights - BEGIN
costs <- table(dd2$Label)
costs[1] <- 1
costs[2] <- c(1,round(table(dd2$Label)[1]/table(dd2$Label)[2]))[1]
# Class weights - END
mod.big <- e1071::svm(Label~., data=dd2, probability=TRUE, class.weights=costs, scale=FALSE, cost=cst, gamma=gmv )
pf.all <- predict(mod.big, dd2, probability=TRUE)
col <- which(colnames(attr(pf.all, "probabilities"))=="1")
preds.all[,kk] <- attr(pf.all, "probabilities")[,col]
preds.all.1.0[,kk] <- ifelse(preds.all[,kk] >=0.5,1,0)
}
colnames(preds.all.1.0)<- colnames(perfs)
tt3 <- apply(preds.all.1.0,1,sum)
multiples <- which(tt3>1)
pp.multi <- apply(preds.all[multiples,],1,function(x) which.max(x))
pred.m <- rep(0,dim(preds.all.1.0)[1])
pred.m[multiples] <-pp.multi
qq <- apply(preds.all.1.0[-multiples,],1,function(x) ifelse(sum(x)==1,which(x==1), 0 ) )
pred.m[is.na(pred.m)] <- 0
pred.m[-multiples] <- qq
head(pred.m)
algorithms <- pred.m
if(d==1){
algorithms[pred.m==0] <- "None"
algorithms[pred.m==1] <- "COF"
algorithms[pred.m==2] <- "FAST ABOD"
algorithms[pred.m==3] <- "INFLO" # "None" #
algorithms[pred.m==4] <- "KDEOS"
algorithms[pred.m==5] <- "KNN"
algorithms[pred.m==6] <- "KNNW"
algorithms[pred.m==7] <- "LDF" # "None" #
algorithms[pred.m==8] <- "LDOF" #
algorithms[pred.m==9] <- "LOF" #
algorithms[pred.m==10] <- "LOOP" #
algorithms[pred.m==11] <- "ODIN" #
algorithms[pred.m==12] <- "SIMLOF" #
}else{
algorithms[pred.m==0] <- "None"
algorithms[pred.m==1] <- "Ensemble_Median_IQR"
algorithms[pred.m==2] <- "LOF_Min_Max"
algorithms[pred.m==3] <- "KNN_Median_IQR"
algorithms[pred.m==4] <- "FAST_ABOD_Min_Max"
algorithms[pred.m==5] <- "iForest_Median_IQR"
algorithms[pred.m==6] <- "KDEOS_Median_IQR"
algorithms[pred.m==7] <- "KDEOS_Min_Max"
algorithms[pred.m==8] <- "LDF_Min_Max"
}
if(vis){
algorithms <- as.factor(algorithms)
print( ggplot2::ggplot(data=xx, ggplot2::aes(x,y))+ ggplot2::geom_point(ggplot2::aes(color=algorithms, shape=algorithms)) + ggplot2::scale_shape_manual(values=1:nlevels(algorithms)) + ggplot2::theme_bw() )
}
out <- list()
out$preds10 <- preds.all.1.0
out$predprobs <- preds.all
out$algorithms <- algorithms
out$filenames <- filenames
out$d <- d
out$coordinates <- xx
return(out)
}
#' Plots a new instance in the instance space.
#'
#' @param svm_out The output of the trained svm using function \code{InstSpace}
#' @param feat The features of the new instance/dataset. This can be computed using \code{ComputeMetaFeaturesAll} or \code{ComputeMetaFeaturesMM}.
#' @param vis If \code{TRUE} then the instance space along with the new instance is plotted.
#'
#' @return new_coords Coodinates of the new instance in the instance space
#'
#' @examples
#' \dontrun{
#' data(Arrhythmia_withoutdupl_05_v05)
#' dat <- Arrhythmia_withoutdupl_05_v05
#' feat <- ComputeMetaFeaturesMM(dat)
#' svmout <- InstSpace(d=1)
#' PlotNewInstance(svmout, feat, vis=FALSE)
#' }
#'
#'@export
PlotNewInstance <- function(svm_out, feat, vis=TRUE){
# ---- NEED TO UPDATE THIS FUNCTION
# ---- FEAT NEED TO BE TRANSFORMED FIRST
d <- svm_out$d
coordinates <- svm_out$coordinates
algorithms <- svm_out$algorithms
feat <- as.data.frame(feat)
if(d==1){
# MIN_MAX NORMALIZATION
# This needs to be updated
A = c(-0.0460080647739237,0.120232405807619,-0.0861514033600992,-0.0938346307541302,0.185445325925922,0.173687193359812,0.354286254682629,-0.284658120332950,0.0379944148808954,-0.207772324871124,-0.202465281281364,-0.0821669746311897,0.184527360695248,-0.132454773210594)
proj_mat <- matrix( A, nrow=2) # , byrow=TRUE
col_names <- c("OPO_Res_ResOut_Median_1", "OPO_Den_Out_95P_1", "Mean_Entropy_Attr", "OPO_Res_Out_Mean_1", "OPO_GDeg_PO_Mean_1", "IQR_TO_SD_95", "OPO_GDeg_Out_Mean_1" )
x_bef <- feat[ , col_names]
#xx <- renormalize(x_bef, d)
xx <- x_bef
}else{
# ALL NORMALIZATION METHODS
col_names <- c("OPO_Res_ResOut_Mean_3","OPO_GDeg_Out_Mean_3", "OPO_GComp_PO_Mean_1", "MEAN_PROD_IQR", "OPO_Res_Out_Mean_3", "OPO_Res_ResOut_Max_3", "OPO_Out_LocDenOut_1_3" )
x_bef <- feat[ , col_names]
xx <- renormalize(x_bef, d)
proj_mat <- matrix( c(-0.00292627076652680,-0.632192408787632,-0.569663704435377,-0.174829830230183,0.0268201438117375,-0.290501129270193,-0.139810891656474, 0.332268685436933,-0.224517922081636,-0.0629031205759270,0.182180599272572,0.543197310499069,0.427965822151968,0.389217122011117 ), nrow=2, byrow=TRUE )
}
new_coords <- t(proj_mat%*%t(xx))
colnames(new_coords) <- colnames(coordinates)
if(vis){
algorithms <- as.factor(algorithms)
print(ggplot2::ggplot(data=coordinates, ggplot2::aes(x,y))+ ggplot2::geom_point(ggplot2::aes(color=algorithms)) + ggplot2::geom_point(ggplot2::aes(x=new_coords[1],y=new_coords[2]), color="black", shape=17, size=4) + ggplot2::theme_bw() )
}
return(new_coords)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.