Nothing
# hyperSMURF 2.0
# April 2018
library(randomForest);
library(unbalanced);
# Function to both oversample by SMOTE the minority class and undersample the majority class
# Input:
# data : data frame or matrix of data
# y : factor with the labels of the classes, 0 for the majority and 1 for the minority class
# fp : multiplicative factor for the SMOTE oversampling of the minority class (def. = 1)
# If n is the number of examples of the minority class, then fp*n new synthetic examples are generated
# ratio : ratio of the #majority/#minority (def.=1)
# k : number of the nearest neighbours for the SMOTE algorithm (def. = 5).
# Output:
# A list with two entries:
# - X: a data frame including the original minority class examples plus the SMOTE oversampled and undersampled data
# - Y: a factor with the labels of the data frame
smote_and_undersample <- function(data, y, fp=1, ratio=1, k=5) {
perc.over <- fp * 100;
perc.under <- (100.0 * ratio * (perc.over+100))/perc.over;
return (ubSMOTE(as.data.frame(data), y, perc.over = perc.over, k = k, perc.under = perc.under));
}
# Function to oversample by SMOTE the minority class
# Input:
# data : data frame or matrix of data including only the minority class
# fp : multiplicative factor for the SMOTE oversampling of the minority class (def=1).
# If n is the number of examples of the minority class, then fp*n new synthetic examples are generated
# If fp<1 no new data are generated and the original data set is returned
# k : number of the nearest neighbours for the SMOTE algorithm (def. = 5).
# Output:
# a data frame including the original minority class examples plus the SMOTE oversampled data
smote <- function(data, fp=1, k=5) {
if (fp<1)
return(data);
perc.over <- fp * 100;
y <- as.factor(rep(1,nrow(data)));
return (ubSMOTE(as.data.frame(data), y, perc.over = perc.over, k = k, perc.under = 0)$X);
}
# Performs a random partition of the indices
# Input:
# n.ex : The number of indices to be partitioned
# n.partitions : number of partitions
# seed : seed for the random generator
# Output:
# a list with n.partitions elements. Each element store the indices of the partition
do.random.partition <- function(n.ex, n.partitions, seed=0) {
if (seed!=0)
set.seed(seed);
part <- vector(mode="list", length=n.partitions);
m.per.part <- round(n.ex/n.partitions);
shuffled <- sample(1:n.ex);
start=1;
end=m.per.part;
for (i in 1:(n.partitions-1)) {
part[[i]] <- shuffled[start:end];
start <- start+m.per.part;
end <- end+m.per.part;
}
part[[n.partitions]] <- shuffled[start:n.ex];
return(part);
}
# hyperSMURF training
# Input:
# data : a data frame or matrix with the training data
# y : a factor with the labels. 0:majority class, 1: minority class.
# n. part : number of partitions
# fp : multiplicative factor for the SMOTE oversampling of the minority class
# ratio : ratio of the #majority/#minority
# k : number of the nearest neighbours for SMOTE
# ntree : number of trees of the rf
# mtry : number of the features to be randomly selected by the rf
# cutoff : a numeric vector of length 2. Cutoff for respectively the majority and minority class
# This parameter is meaningful when used with the thresholded version of hyperSMURF in the testing phase,
# i.e. with hyperSMURF.test.thresh
# seed : initialization seed for the random generator
# file : name of the file where the hyperSMURF model will be saved. If file=="" (def.) no model is saved.
# Output:
# A list of trained RF models. Each element of the list is a randomForest object.
hyperSMURF.train <- function (data, y, n.part=10, fp=1, ratio=1, k=5, ntree=10, mtry=5, cutoff=c(0.5,0.5), seed = 0, file="") {
data.min <- as.data.frame(data[y==1,]); # only data of the minority class
data <- data[-which(y==1),]; # only data of the majority class
gc();
n.data <- nrow(data);
rf.list <- vector(mode="list", n.part);
# Construct the random partitions of majority examples
part <- do.random.partition(n.data, n.partitions=n.part, seed=seed);
for (i in 1:n.part) {
# SMOTE oversampling
data.min.over <- smote(data.min, fp=fp, k=k);
n.data.min.over <- nrow(data.min.over);
# Majority undersampling and construction of the training set
n.maj <- ratio*n.data.min.over;
ind.part <- part[[i]];
# indices of the majority class
if (length(ind.part) >= n.maj)
indices.maj <- ind.part[1:n.maj]
else
indices.maj <- ind.part;
data.train <- rbind(data.min.over, data[indices.maj,]);
y <- as.factor(c(rep(1,n.data.min.over), rep(0, length(indices.maj))));
rf.list [[i]] <-randomForest(data.train, y, ntree = ntree, mtry = mtry, cutoff = cutoff);
rm(data.train, data.min.over); gc();
cat("Training of ensemble ", i, "done.\n");
}
if (file!="")
save(rf.list, file=file);
return(rf.list);
}
# hyperSMURF test
# Input:
# data : a data frame or matrix with the test data
# HSmodel: a list including the trained random forest models
# Output:
# a named vector with the computed probabilities for each example (HyeprSMURF score)
hyperSMURF.test <- function (data, HSmodel) {
n.models <- length(HSmodel);
n.ex <- nrow(data);
prob <- numeric(n.ex);
for (i in 1:n.models) {
prob <- prob + predict(HSmodel[[i]], data, type="prob")[,2];
gc();
}
prob <- prob/n.models;
names(prob) <- rownames(data);
return(prob);
}
# hyperSMURF test - thresholded version
# The threshold is embedded in the HSmodel according to the cutoff parameter set in the training phase.
# Input:
# data : a data frame or matrix with the test data
# HSmodel: a list including the trained random forest models
# Output:
# a named vector with the computed probabilities for each example (HyeprSMURF thresholded score)
hyperSMURF.test.thresh <- function (data, HSmodel) {
n.models <- length(HSmodel);
n.ex <- nrow(data);
prob <- numeric(n.ex);
for (i in 1:n.models) {
lab <- predict(HSmodel[[i]], data, type="response");
p <- ifelse(lab==1, 1, 0);
prob <- prob + p;
gc();
}
prob <- prob/n.models;
names(prob) <- rownames(data);
return(prob);
}
# hyperSMURF cross-validation
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:majority class, 1: minority class.
# kk : number of folds (def: 5)
# n. part : number of partitions
# fp : multiplicative factor for the SMOTE oversampling of the minority class
# If fp<1 no oversampling is performed.
# ratio : ratio of the #majority/#minority
# k : number of the nearest neighbours for SMOTE
# ntree : number of trees of the rf
# mtry : number of the features to be randomly selected by the rf
# cutoff : a numeric vector of length 2. Cutoff for respectively the majority and minority class
# This parameter is meaningful when used with the thresholded version of hyperSMURF (parameter thresh=TRUE)
# i.e. with hyperSMURF.test.thresh
# thresh : logical. If TRUE the thesholded version of hyperSMURF is exectuted (def: FALSE)
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# fold.partition: vector of size nrow(data) with values in interval [0,kk). The values indicate the fold of the cross validation of each example. If NULL (default) the folds are randomly generated.
# file : name of the file where the cross-validated hyperSMURF models will be saved. If file=="" (def.) no model is saved.
# Output:
# a vector with the cross-validated hyperSMURF probabilities (hyperSMURF scores).
hyperSMURF.cv <- function (data, y, kk=5, n.part=10, fp=1, ratio=1, k=5, ntree=10, mtry=5, cutoff=c(0.5,0.5), thresh=FALSE, seed = 0, fold.partition=NULL, file="") {
n.data <- nrow(data);
indices.positives <- which(y == 1) ;
scores <- numeric(n.data);
names(scores) <- rownames(data);
if (is.null(fold.partition)) {
cat("Creating new folds\n")
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
} else {
cat("Using given folds\n")
folds <- do.stratified.cv.data.from.folds(1:n.data,indices.positives,fold.partition,k=kk);
}
for (i in 1:kk) {
# preparation of the training and test data
ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
ind.pool.pos <- integer(0);
ind.pool.neg <- integer(0);
for (j in 1:kk)
if (j!=i) {
ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
}
data.train <- data[c(ind.pool.pos, ind.pool.neg),];
y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
# training
cat("Starting training on Fold ", i, "...\n");
HS <- hyperSMURF.train (data.train, y.train, n.part=n.part, fp=fp, ratio=ratio, k=k, ntree=ntree, mtry=mtry, cutoff=cutoff, seed = seed);
rm(data.train); gc();
# test
data.test <- data[ind.test,];
cat("Starting test on Fold ", i, "...\n");
if (thresh)
scores[ind.test] <- hyperSMURF.test.thresh(data.test, HS)
else
scores[ind.test] <- hyperSMURF.test(data.test, HS);
cat("End test on Fold ", i, ".\n");
rm(data.test);
if (file=="")
rm(HS)
else
HS.list <- c(HS.list, HS);
gc();
cat("Fold ", i, " done -----\n");
}
if (file != "")
save(HS.list, file);
return(scores);
}
######################################################################
# Function to generate data for the stratified cross-validation.
# Input:
# examples : indices of the examples (a vector of integer)
# positives: vector of integer. Indices of the positive examples. The indices refer to the indices of examples
# k : number of folds (def = 10)
# seed : seed of the random generator (def=0). If is set to 0 no initiazitation is performed
# Ouptut:
# a list with 2 two components
# - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold
# - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold
# N.B.: in both elements indices refer to the values of the examples vector
do.stratified.cv.data <- function(examples, positives, k=10, seed=0) {
if (seed!=0)
set.seed(seed);
fold.non.positives <- fold.positives <- list();
for (i in 1:k) {
fold.non.positives[[i]] <- integer(0);
fold.positives[[i]] <- integer(0);
}
# examples <- 1:n;
non.positives <- setdiff(examples,positives);
# non.positives <- examples[-positives];
non.positives <- sample(non.positives);
positives <- sample(positives);
n.positives <- length(positives);
resto.positives <- n.positives%%k;
n.pos.per.fold <- (n.positives - resto.positives)/k;
n.non.positives <- length(non.positives);
resto.non.positives <- n.non.positives%%k;
n.non.pos.per.fold <- (n.non.positives - resto.non.positives)/k;
j=1;
if (n.non.pos.per.fold > 0)
for (i in 1:k) {
fold.non.positives[[i]] <- non.positives[j:(j+n.non.pos.per.fold-1)];
j <- j + n.non.pos.per.fold;
}
j.pos=1;
if (n.pos.per.fold > 0)
for (i in 1:k) {
fold.positives[[i]] <- positives[j.pos:(j.pos+n.pos.per.fold-1)];
j.pos <- j.pos + n.pos.per.fold;
}
if (resto.non.positives > 0)
for (i in k:(k-resto.non.positives+1)) {
fold.non.positives[[i]] <- c(fold.non.positives[[i]], non.positives[j]);
j <- j + 1;
}
if (resto.positives > 0)
for (i in 1:resto.positives) {
fold.positives[[i]] <- c(fold.positives[[i]], positives[j.pos]);
j.pos <- j.pos + 1;
}
return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives));
}
######################################################################
# Function to generate data for cross-validation from pre-computed folds
# Input:
# examples : indices of the examples (a vector of integer)
# positives: vector of integer. Indices of the positive examples. The indices refer to the indices of examples
# folds: vector of length equal to examples, with values in interval [0,kk). The value indicates the partition in the cross validation step of the class
# k : number of folds (def = 10)
# Ouptut:
# a list with 2 two components
# - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold
# - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold
# N.B.: in both elements indices refer to the values of the examples vector
do.stratified.cv.data.from.folds <- function(examples, positives, folds, k=10) {
non.positives <- setdiff(examples,positives);
fold.non.positives <- fold.positives <- list();
for (i in 1:k) {
fold.non.positives[[i]] <- integer(0);
fold.positives[[i]] <- integer(0);
fold.positives[[i]] <- positives[folds[positives]==i-1];
fold.non.positives[[i]] <- non.positives[folds[non.positives]==i-1];
}
return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives));
}
######################################################################
# Function to generate synthetic imbalanced data
# A variable number of minority and majority class examples are generated. All the features of the majority class are distributed according
# to a gausian distributin with mean=0 and sd=1. Of the ovreall n.features n.inf. features of the minority class are distributed according to a gaussian centered in 1 with standard deviation sd.
# Input:
# n.pos: number of positive (minority clsss) examples (def. 100)
# n.neg: number of negative (majority class) examples (def. 2000)
# n.feaures: total number of features (def. 10)
# n.inf.features: number of informative features (def. 2)
# sd: standard deviation of the informative features (def.1)
# seed: intialization seed for the random number generator. If 0 (def) current clock time is used.
# Output:
# A list with two elements:
# data: the matrix of the synthetic data having pos+n.neg rows and n.features columns
# labels: a factor with the labels of he examples: 1 for minority and 0 for majority class.
# construction of a synthetic unbalanced data set
imbalanced.data.generator <- function(n.pos=100, n.neg=2000, n.features=10, n.inf.features=2, sd=1, seed=0) {
if (seed!=0)
set.seed(seed);
class0 <- matrix(rnorm(n.neg*n.features, mean=0, sd=1), nrow=n.neg);
class1 <-matrix(rnorm(n.pos*n.inf.features, mean=1, sd=sd), nrow=n.pos);
classr1<-matrix(rnorm(n.pos*(n.features-n.inf.features), mean=0, sd=1), nrow=n.pos);
class1 <- cbind(class1,classr1);
data <- rbind(class1,class0);
labels<-factor(c(rep(1,n.pos),rep(0,n.neg)), levels=c("1","0"));
return (list(data=data, labels=labels));
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.