# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
# R Code
# Meta feature library
# Matthias Reif 2012
# A Comprehensive Dataset for Evaluating Approaches of various Meta-Learning Tasks
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
n_col <- function(x) {
dim(x)[2]
}
n_row <- function(x) {
dim(x)[1]
}
get_class_indx <- function(data) {
length(data)
}
get_col_name <- function(data, col) {
names(data)[col]
}
get_att_names <- function(data) {
names(data)[-get_class_indx(data)]
}
get_class_name <- function(data) {
get_col_name(data, get_class_indx(data))
}
get_num_att <- function(data) {
length(data) - 1
}
get_classes <- function(data) {
levels(data[,get_class_indx(data)])
}
get_column_of_class <- function(data, column, class) {
class_index <- get_class_indx(data)
m <- data[class_index]==class
data[,column][m]
}
get_att_data <- function(data) {
class_index <- get_class_indx(data)
data[-class_index]
}
get_class_data <- function(data) {
data[get_class_indx(data)]
}
get_num_classes <- function(data) {
length(get_classes(data))
}
get_num_numeric_att <- function(data) {
att_data <- get_att_data(data)
if(is.factor(att_data)) {
n <- 0
} else if(is.numeric(att_data)) {
n <- 1
} else {
n <- 0
for(att in 1:n_col(att_data)) {
n <- n + is.numeric(att_data[,att])
}
}
n
}
get_num_nominal_att <- function(data) {
get_num_att(data) - get_num_numeric_att(data)
}
get_num_samples <- function(data) {
n_row(data)
}
normalize <- function(data) {
data_2 <- data.frame(data)
num_att <- get_num_att(data_2)
for(col in 1:num_att) {
if(is.numeric(data[,col])) {
min_v = min(data_2[,col])
max_v = max(data_2[,col])
d <- (max_v - min_v)
if(d == 0) {
d <- 1.0
}
data_2[,col] <- (data_2[,col] - min_v) / d
}
}
return(data_2)
}
replace_nominal_column <- function(column, type=1) {
column <- as.factor(column)
symbols <- levels(column)
result <- {}
for (i in 1:(length(symbols)-type)) {
result <- cbind(result, as.double(column == symbols[i]))
}
result
}
replace_nominal <- function(data, skip=numeric(0)) {
result <- {}
for(i in 1:n_col(data)) {
column <- data[,i]
if(is.numeric(column) || i %in% skip) {
if(is.numeric(column)) {
result <- cbind(result, column)
} else {
result <- cbind(result, data[i])
}
} else {
result <- cbind(result, replace_nominal_column(column))
}
}
data.frame(result)
}
replace_nominal_att <- function(data) {
replace_nominal(data, skip=get_class_indx(data))
}
tree.depth <- function(nodes) {
depth <- floor(log(nodes, base=2) + 1e-7);
as.vector(depth - min(depth));
}
##########################################################################
## Feature Selection for Meta-learning ##
## Alexandros Kalousis and Melanie Hilario ##
## Advances in Knowledge Discovery and Data Mining ##
## Lecture Notes in Computer Science, 2001, Volume 2035/2001, 222-233 ##
## DOI: 10.1007/3-540-45357-1_26 ##
##########################################################################
get_symbol_stats <- function(data) {
n <- numeric(0)
for(i in 1:n_col(data)) {
column <- data[,i]
if(!is.numeric(column)) {
n <- c(n, length(levels(as.factor(column))))
}
}
result <- get_min_max_mean_sd(n, "symbols")
result["symbols_sum"] <- sum(n)
result
}
get_single_normalized_entropies <- function(data) {
m <- numeric(0)
att_data <- get_att_data(data)
for(i in 1:n_col(att_data)) {
m[paste("att_entr", i, sep="_")] <- get_normalized_entropy(att_data[i])
}
m
}
get_single_mutual_information <- function(data) {
m <- numeric(0)
att_data <- get_att_data(data)
labels <- get_class_data(data)
for(i in 1:n_col(att_data)) {
m[paste("att_mut_inf", i, sep="_")] <- get_mutual_information_column(att_data[i], labels)
}
m
}
##########################################################################
## Meta-data: Characterization of Input Features for Meta-learning ##
## Ciro Castiello, Giovanna Castellano and Anna Maria Fanelli ##
## Modeling Decisions for Artificial Intelligence ##
## Lecture Notes in Computer Science, 2005, Volume 3558/2005, 457-468 ##
## DOI: 10.1007/11526018_45 ##
##########################################################################
get_entropy <- function(column, method="ML") {
column <- column[,1]
if(is.numeric(column)) {
column <- infotheo::discretize(column, disc="equalwidth")
}
infotheo::entropy(column)
}
get_normalized_entropy <- function(column) {
column <- column[,1]
if(is.numeric(column)) {
column <- infotheo::discretize(column, disc="equalwidth")
n <- round(sqrt(dim(column)[1])) # -- number of bins = square root of number of samples --
} else {
n <- length(levels(as.factor(column)))
}
infotheo::entropy(column) / log(n)
}
get_avg <- function(data, f) {
att_data <- get_att_data(data)
avg <- 0.0
for(i in 1:n_col(att_data)) {
avg <- avg + f(att_data[i])
}
avg / n_col(att_data)
}
get_avg_with_labels <- function(data, f) {
att_data <- get_att_data(data)
labels <- get_class_data(data)
avg <- 0.0
for(i in 1:n_col(att_data)) {
avg <- avg + f(att_data[i], labels)
}
avg / n_col(att_data)
}
get_attribute_entropy <- function(data) {
get_avg(data, get_entropy)
}
get_normalized_attribute_entropy <- function(data) {
get_avg(data, get_normalized_entropy)
}
get_class_entropy <- function(data) {
infotheo::entropy(get_class_data(data))
}
get_normalized_class_entropy <- function(data) {
get_normalized_entropy(get_class_data(data))
}
get_joint_entropy_column <- function(column, labels) {
column <- column[,1]
labels <- labels[,1]
if(is.numeric(column)) {
column <- infotheo::discretize(column, disc="equalwidth")
}
infotheo::entropy(paste(column, labels))
}
get_joint_entropy <- function(data) {
get_avg_with_labels(data, get_joint_entropy_column)
}
get_mutual_information_column <- function(column, labels) {
column <- column[,1]
labels <- labels[,1]
if(is.numeric(column)) {
column <- infotheo::discretize(column, disc="equalwidth")
}
infotheo::mutinformation(column, labels)
}
get_mutual_information <- function(data) {
get_avg_with_labels(data, get_mutual_information_column)
}
get_equivalent_number_of_attributes <- function(data) {
get_class_entropy(data) / get_mutual_information(data)
}
get_noise_signal_ratio <- function(data) {
(get_attribute_entropy(data) - get_mutual_information(data)) / get_mutual_information(data)
}
get_skewness <- function(data, type=1) {
num_att <- get_num_att(data)
classes <- get_classes(data)
skew <- 0.0
for(class in classes) {
s <- 0.0
n <- 0.0
for(col in 1:num_att) {
att_data_class <- get_column_of_class(data, col, class)
if(is.numeric(att_data_class)) { # -- skip nominal attributes --
v <- e1071::skewness(att_data_class, type=type)
if(!is.nan(v) && !is.na(v)) { # -- NaN e.g. if the attribute has equal values for one class --
s <- s + abs(v)
n <- n + 1.0
}
}
}
if(n > 0.0) { # -- 0 e.g. if one class with only one smaple
skew <- skew + (s / n)
}
}
skew / (length(classes))
}
get_kurtotis <- function(data, type=1) {
num_att <- get_num_att(data)
classes <- get_classes(data)
kurtosis <- 0.0
for(class in classes) {
s <- 0.0
n <- 0.0
for(col in 1:num_att) {
att_data_class <- get_column_of_class(data, col, class)
if(is.numeric(att_data_class)) { # -- skip nominal attributes --
v <- e1071::kurtosis(att_data_class, type=type)
if(!is.nan(v) && !is.na(v)) { # -- NaN e.g. if the attribute has equal values for one class --
s <- s + v
n <- n + 1.0
}
}
}
if(n > 0.0) { # -- 0 e.g. if one class with only one smaple
kurtosis <- kurtosis + (s / n)
}
}
(kurtosis / (length(classes))) + 3.0
}
get_cancors <- function(data) {
att_data <- get_att_data(data)
labels <- get_class_data(data)
preprocess_att_data <- replace_nominal(att_data)
preprocess_labels <- replace_nominal(labels)
cancor(preprocess_att_data, preprocess_labels)$cor
}
get_cancor <- function(data, n) {
m = numeric(0)
c <- get_cancors(data)[1:n]
for(i in 1:n) {
m[paste("cancor",i, sep='_')] <- c[i]
}
m
}
get_fracts <- function(data) {
cancors <- get_cancors(data)
cancor2 = cancors * cancors
cumsum(cancor2) / sum(cancor2)
}
get_fract <- function(data, n) {
m = numeric(0)
c <- get_fracts(data)[1:n]
for(i in 1:n) {
m[paste("fract",i, sep='_')] <- c[i]
}
m
}
get_abs_cor <- function(data) {
num_att <- get_num_att(data)
if(num_att > 1) { # -- correlation between attributes -> one attribute useless
classes <- get_classes(data)
sum = 0.0
n = 0.0
for(class in classes) {
for(col1 in 1:num_att) {
for(col2 in 1:num_att) {
if(col1 != col2) {
col1_data_class <- get_column_of_class(data, col1, class)
col2_data_class <- get_column_of_class(data, col2, class)
if(!is.numeric(col1_data_class)) {
col1_data_class <- replace_nominal_column(col1_data_class)
}
if(!is.numeric(col2_data_class)) {
col2_data_class <- replace_nominal_column(col2_data_class)
}
c <- tryCatch(cancor(col1_data_class, col2_data_class)$cor[1], error=function(err) NA)
if (!is.na(c)) {
sum <- sum + abs(c)
n <- n +1
}
}
}
}
}
sum / n
} else {
0.0
}
}
get_naive_bayes <- function(data) {
att_data <- get_att_data(data)
labels <- get_class_data(data)[,1]
model <- e1071::naiveBayes(att_data, labels)
predictions <- predict(model, att_data)
sum(predictions == labels) / get_num_samples(data)
}
get_linear_discriminant <- function(data) {
att_data <- get_att_data(data)
preprocess_att_data <- replace_nominal(att_data)
labels <- get_class_data(data)[,1]
model <- tryCatch(LdaClassic(preprocess_att_data, labels), error=function(err) NULL)
if(is.null(model)) {
NA
} else {
predictions <- predict(model, preprocess_att_data)@classification
sum(predictions == labels) / get_num_samples(data)
}
}
get_min_max_mean_sd <- function(x, name) {
m <- numeric(0)
m[paste(name, "min", sep="_")] = min(x)
m[paste(name, "max", sep="_")] = max(x)
m[paste(name, "mean", sep="_")] = mean(x)
s <- sd(x)
if(is.na(s)) {
s <- 0
}
m[paste(name, "sd", sep="_")] <- s
m
}
#######################################################################################
## Decision tree based measures ##
#######################################################################################
## Improved Dataset Characterisation for Meta-learning ##
## Yonghong Peng, Peter A. Flach, Carlos Soares and Pavel Brazdil ##
## Lecture Notes in Computer Science, 2002, Volume 2534/2002, 193-208 ##
## DOI: 10.1007/3-540-36182-0_14 ##
## [all 15 features] ##
#######################################################################################
## A higher-order approach to meta-learning ##
## H. Bensusan, C. Giraud-Carrier, and C. Kennedy. ##
## In Proceedings of the ECML'2000 Workshop on Meta- Learning: ##
## Building Automatic Advice Strategies for Model Selection and Method Combination ##
## pages 109-117, 2000. ##
## [only few features] ##
#######################################################################################
get_tree_properties <- function(data) {
f <- as.formula(paste(get_class_name(data), "~ ."))
tree_time <- R.utils::System$currentTimeMillis();
model <-rpart::rpart(f, data = data, method = "class", control=rpart::rpart.control(maxcompete=0, maxsurrogate=0))
tree_time <-R.utils::System$currentTimeMillis() - tree_time
leaves = rownames(model$frame[model$frame$var=="<leaf>",])
n_leaves = length(leaves)
n_nodes = sum(model$frame$var!="<leaf>")
branch_lengths = numeric(0)
for(leaf in leaves) {
branch_lengths <- c(branch_lengths, length(rpart::path.rpart(model, leaf, print.it=FALSE)[[leaf]]))
}
att_used = numeric(0)
for(att in get_att_names(data)) {
att_used <- c(att_used, sum(model$frame$var==att))
}
depths <- tree.depth(as.numeric(row.names(model$frame)))
level <- numeric(0)
for(d in 1:max(depths)) {
level <- c(level, sum(depths==d))
}
result <- numeric(0)
result["nodes"] <- n_nodes
result["leaves"] <- n_leaves
result["nodes_per_attribute"] <- n_nodes / get_num_att(data)
result["nodes_per_instance"] <- n_nodes / get_num_samples(data)
result["leaf_corrobation"] <- mean(model$frame$n[model$frame$var=="<leaf>"]) / get_num_samples(data)
result <- c(result, get_min_max_mean_sd(level, "level"))
result <- c(result, get_min_max_mean_sd(branch_lengths, "branch"))
result <- c(result, get_min_max_mean_sd(att_used, "attribute"))
time <- numeric(0)
time["tree_time"] <- tree_time
list(result=result, time=time)
}
get_decision_stump <- function(data, col) {
f <- as.formula(paste(get_class_name(data), "~", get_col_name(data, col)))
# model <-mvpart::rpart(f, data = data, method = "class", control=rpart.control(maxdepth=1))
model <- rpart::rpart(f, data = data, method = "class", control=rpart::rpart.control(maxdepth=1))
att_data <- get_att_data(data)
labels <- get_class_data(data)[,1]
predictions <- predict(model, att_data, type = "class")
sum(predictions == labels) / get_num_samples(data)
}
get_decision_stumps <- function(data) {
v <- numeric(0)
g <- numeric(0)
num_att <- get_num_att(data)
time <- R.utils::System$currentTimeMillis();
for(col in 1:num_att) {
v <- c(v, get_decision_stump(data, col))
f <- as.formula(paste(get_class_name(data), "~", get_col_name(data, col)))
#print(f)
g <- c(g, attrEval(f, data, estimator="GainRatio")[1])
}
time <- R.utils::System$currentTimeMillis() - time
# -- attribute with minimal gain ration --
#f <- as.formula(paste(get_class_name(data), "~ ."))
#gainRatios <- attrEval(f, data, estimator="GainRatio")
result <- get_min_max_mean_sd(v, "stump")
result['stump_min_gain'] <- v[which.min(g)]
result['stump_random'] <- v[sample(1:length(v), 1)]
list(result=result, time=time)
}
get_knn <- function(data, k, cv=TRUE) {
att_data <- get_att_data(data)
labels <- get_class_data(data)
if(cv) {
predictions <- FNN::knn.cv(att_data, labels[,], k=k, prob=FALSE, algorithm="kd_tree")
} else {
predictions <- FNN::knn(att_data, att_data, labels[,], k=k, prob=FALSE, algorithm="kd_tree")
}
sum(as.vector(predictions) == labels) / get_num_samples(data)
}
get_knns <- function(data, max_k) {
result <- numeric(0)
time <- R.utils::System$currentTimeMillis()
for(k in 1:max_k) {
result[paste("nn", k, sep="_")] = get_knn(data, k)
gc()
}
time <- R.utils::System$currentTimeMillis() - time
result["nn_sd"] <- sd(result)
list(result=result, time=time)
}
get_class_stats <- function(data) {
classes <- get_classes(data)
class_data <- get_class_data(data)
probs <- numeric(0)
for(class in classes) {
probs[class] <- sum(class_data == class) / n_row(data)
}
get_min_max_mean_sd(probs, "class_prob")
}
measure_landmark <- function(f, data, param) {
measure_landmark_p(f, data, NULL)
}
measure_landmark_p <- function(f, data, param) {
m = numeric(0)
time <- R.utils::System$currentTimeMillis()
if(is.null(param)) {
result <- f(data)
} else {
result <- f(data, param)
}
time <- R.utils::System$currentTimeMillis() - time
if(is.na(result)) {
time <- NA
}
list(result=result, time=time)
}
compute_landmarking <- function(data, data_preprocessed=NULL) {
if(is.null(data_preprocessed)) {
data_numeric <- replace_nominal_att(data)
data_preprocessed <- normalize(data_numeric)
}
naive_bayes <- measure_landmark(get_naive_bayes, data)
lda <- measure_landmark(get_linear_discriminant, data_preprocessed)
stumps <- get_decision_stumps(data)
knns <- get_knns(data_preprocessed, 1)
result <- numeric(0)
result["naive_bayes"] <- naive_bayes$result
result["lda"] <- lda$result
result <- c(result, stumps$result)
result <- c(result, knns$result)
time <- numeric(0)
time["naive_bayes_time"] <- naive_bayes$time
time["lda_time"] <- lda$time
time["stump_time"] <- stumps$time
time["nn_time"] <- knns$time
list(result=result, time=time)
}
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
combine <- function(x) {
result <- numeric(0)
for(name in names(x)) {
result <- c(result, x[[name]])
}
result
}
test_mf <- function(data) {
print(data)
}
compute_grouped_meta_features <- function(data) {
total_time <- R.utils::System$currentTimeMillis()
data_numeric <- replace_nominal_att(data)
data_preprocessed <- normalize(data_numeric)
time <- numeric(0)
simple <- numeric(0)
simple_time <- R.utils::System$currentTimeMillis()
simple['classes'] <- get_num_classes(data)
simple['attributes'] <- get_num_att(data)
simple['numeric'] <- get_num_numeric_att(data)
simple['nominal'] <- get_num_nominal_att(data)
simple['samples'] <- get_num_samples(data)
simple['dimensionality'] <- simple['attributes'] / simple['samples']
simple['numeric_rate'] <- simple['numeric'] / simple['attributes'] ## -- Feature Selection for Meta-Learning
simple['nominal_rate'] <- simple['nominal'] / simple['attributes'] ## -- Feature Selection for Meta-Learning
simple <- c(simple, get_symbol_stats(data))
simple <- c(simple, get_class_stats(data))
simple_time <- R.utils::System$currentTimeMillis() - simple_time
statistical <- numeric(0)
statistical_time <- System$currentTimeMillis()
statistical['skewness'] <- get_skewness(data)
statistical['skewness_prep'] <- get_skewness(data_preprocessed)
statistical['kurtosis'] <- get_kurtotis(data, type=1)
statistical['kurtosis_prep'] <- get_kurtotis(data_preprocessed, type=1)
statistical['abs_cor'] <- get_abs_cor(data)
statistical <- c(statistical, get_cancor(data, 1))
statistical <- c(statistical, get_fract(data, 1))
statistical_time <- System$currentTimeMillis() - statistical_time
inftheo <- numeric(0)
inftheo_time <- R.utils::System$currentTimeMillis()
inftheo['class_entropy'] <- get_class_entropy(data)
inftheo['normalized_class_entropy'] <- get_normalized_class_entropy(data)
inftheo['attribute_entropy'] <- get_attribute_entropy(data)
inftheo['normalized_attribute_entropy'] <- get_normalized_attribute_entropy(data)
inftheo['joint_entropy'] <- get_joint_entropy(data)
inftheo['mutual_information'] <- get_mutual_information(data)
inftheo['equivalent_attributes'] <- get_equivalent_number_of_attributes(data)
inftheo['noise_signal_ratio'] <- get_noise_signal_ratio(data)
inftheo_time <- R.utils::System$currentTimeMillis() - inftheo_time
modelbased <- get_tree_properties(data)
time <- c(time, modelbased$time)
modelbased <- modelbased$result
landmarking <- compute_landmarking(data, data_preprocessed=data_preprocessed)
time <- c(time, landmarking$time)
landmarking <- landmarking$result
total_time <- R.utils::System$currentTimeMillis() - total_time
time['simple_time'] <- simple_time
time['statistical_time'] <- statistical_time
time['inftheo_time'] <- inftheo_time
time['total_time'] <- total_time
obj = list(simple=simple, statistical=statistical, inftheo=inftheo, modelbased=modelbased,
landmarking=landmarking, time=time)
return(obj)
}
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
getStatlogFeatures = function(data) {
aux = compute_grouped_meta_features(data)
return(aux)
}
# -------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.