Nothing
#Last modified on 01/30/2014
######################################################################
.onAttach <- function(libname,pkgname){
packageStartupMessage("Loaded ROSE ", as.character(packageDescription("ROSE")[["Version"]]),"\n")
}
######################################################################
######################################################################
#ovun.sample main function
######################################################################
ovun.sample <- function(formula, data, method="both", N, p=0.5, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{
###checks
if( missing(formula) )
stop("formula is reaquired.\n")
method <- match.arg(method, choices=c("both", "under", "over"))
if( !method%in%c("both", "over", "under") )
stop("Method must be 'both', 'over', or 'under'.\n")
###
Call <- match.call()
m <- match(c("formula", "data","method","N", "p", "seed", "subset", "na.action"), names(Call), 0L)
Call1 <- Call[c(1L, m)]
Call1[[1L]] <- omnibus.balancing
res <- eval(Call1)
out <- list(Call=match.call(), method=method, data=res$data)
class(out) <- "ovun.sample"
out
}
##print method for ovun.sample
print.ovun.sample <- function(x, ...)
{
cat("\n")
cat("Call: \n")
print(x$Call)
Method <- switch(match.arg(x$method, choices=c("both", "under", "over")),
both="combination of over- and under-sampling",
under="undersampling",
over="oversampling"
)
cat("\n")
cat("Data balanced by", Method,"\n")
cat("\n")
print(x$data)
}
###summary method for ovun.sample
summary.ovun.sample <- function(object, ...)
{
out <- list( Call=object$Call, Summary=summary(object$data), method=object$method )
class(out) <- "summary.ovun.sample"
out
}
###print method for summary ovun.sample
print.summary.ovun.sample <- function(x, ...)
{
cat("\n")
cat("Call: \n")
print(x$Call)
cat("\n")
Method <- switch(match.arg(x$method, choices=c("both", "under", "over")),
both="combination of over- and under-sampling",
under="undersampling",
over="oversampling"
)
cat("Summary of data balanced by", Method ,"\n")
cat("\n")
print(x$Summary)
}
######################################################################
##function that provides a formula with non tranformed variables only
######################################################################
##this function is NOT exported
adj.formula <- function(formula, data)
{
if( missing(data) )
frml.env <- environment(formula)
else
frml.env <- data
formula <- terms(formula, data = frml.env)
vars <- attr(formula, "variables")
vars <- sapply(vars, function(x) paste(deparse(x,width.cutoff=500), collapse=' '))[-1L]
#remove all characters before either ( or /
vars <- sub("*.*[(/]","", vars)
#remove all characters after either ^ or )
vars <- sub("['^')].*","", vars)
vars <- unique(vars)
formula <- as.formula(paste(vars[1], "~", paste(vars[-1], collapse= "+")))
attr(formula, "variables") <- vars
formula
}
######################################################################
#This function is the wrapper for all the implemented data balaning remedies
######################################################################
##this function is NOT exported
omnibus.balancing <- function(formula, data, method, subset, na.action, N, p=0.5, seed, hmult.majo=1, hmult.mino=1)
{
if( missing(formula) )
stop("formula is required\n")
if(missing(method))
method <- "both"
if( (method=="under" | method=="over" ) & !missing(N) & !missing(p) )
stop("Too many arguments. Need to specify either N or p.\n")
formula.orig <- formula
formula <- adj.formula(formula, data)
if( missing(subset) )
subset=options("subset")$subset
if( missing(na.action) )
na.action=options("na.action")$na.action
flg.data <- 0
if( !missing(data) )
{
lst.model.frame <- list(formula=formula, data=data, subset=subset, na.action=na.action)
if( is.environment(data) )#| is.list(data) )
flg.data <- 2
else
flg.data <- 1
}
else
lst.model.frame <- list(formula=formula, data=NULL, subset=subset, na.action=na.action)
if( formula.orig[[3]]!="." & eval(formula)!=formula.orig )
warning("Transformations of variables are not allowed.\n New data have been generated by using non-transformed variables.\n ")
mf <- do.call(model.frame, lst.model.frame)
cn <- rownames( attributes( attributes(mf)$terms )$factors )
data.st <- data.frame(mf)
y <- data.st[, 1]
X <- data.frame(data.st[,-1])
n <- length(y)
d <- NCOL(X)
classy <- class(y)
y <- factor(y)
T <- table(y)
classx <- sapply(as.data.frame(X), class)
###checks
if(n<2)
stop("Too few observations.\n")
if( length(T)>2 )
stop("The response variable must have 2 levels.\n")
else
if( length(T)==1 )
stop("The response variable has only one class.\n")
if( p<0 | p>1 )
stop("p must be in the interval 0-1.\n")
###
#identify which is the label associated to the majority and minority classes
majoY <- levels(y)[which.max(T)]
minoY <- levels(y)[which.min(T)]
#identify the majority and minority class examples
ind.mino <- which( y == minoY )
ind.majo <- which( y == majoY )
if( !missing(seed) )
set.seed(seed)
data.obj <- switch(method,
both = ou.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, classy, X),
over = over.sampl(n, N, p, ind.majo, ind.mino, majoY, minoY, y, classy, X),
under = under.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X),
rose = rose.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X, classx, d, T, hmult.majo, hmult.mino)
)
data.out <- data.obj$data.out
ynew <- data.obj$ynew
Xnew <- data.obj$Xnew
#re-position columns
if( !missing(data) & flg.data!=0 )
{
#put data frame names in the right order
if(flg.data==1)
colnames(data.out) <- colnames(data)[colnames(data)%in%cn]
else
colnames(data.out) <- attr(formula, "variables")[attr(formula, "variables")%in%cn]
#insert y
indY <- colnames(data.out)==cn[1]
data.out[, indY] <- ynew
#see wether the order of the variables in formula is the same as in data. If no, swap columns according to the order in data
swap.col <- order( pmatch( cn[-1], colnames(data.out)[!indY] ) )
data.out[,!indY] <- Xnew[, (1:d)[swap.col] ]
}
else
{
if( length(cn)-1 < d )
colnames(data.out) <- c(cn[1], colnames(X))
else
colnames(data.out) <- cn
}
list(data=data.out, call=match.call())
}
######################################################################
#Combination of over and under sampling
######################################################################
##this function is NOT exported
ou.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, classy, X)
{
if( missing(N) )
N <- n
#number of new minority class examples
n.mino.new <- sum(rbinom(N, 1, p))
#number of new majority class examples
n.majo.new <- N-n.mino.new
id.majo.new <- sample(ind.majo, n.majo.new, replace=TRUE)
id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)
#create X
Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
#create y
if( classy%in%c("character", "integer", "numeric") )
ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
if( classy=="factor" )
ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )
data.out <- data.frame(ynew, Xnew)
rownames(data.out) <- 1:N
list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}
######################################################################
#Under sampling
######################################################################
##this function is NOT exported
under.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X)
{
n.mino.new <- sum(y == minoY)
if( missing(N) )
{
# Determination of N and n.majo in version 0.0.2
if( p<n.mino.new/n )
warning("non-sensible to specify p smaller than the actual proportion of minority class examples in the original sample.\n")
#theoretical n.majo
n.majo <- round( (1-p)*n.mino.new/p )
#estimated n.majo
n.majo.new <- sum( rbinom(n.mino.new+n.majo, 1, 1-p) )
#final sample size
N <- n.majo.new + n.mino.new
}
else
{
if(N<n.mino.new)
stop("N must be greater or equal than the number of minority class examples.\n")
else
n.majo.new <- N-n.mino.new
}
id.mino.new <- ind.mino
id.majo.new <- sample(ind.majo, n.majo.new, replace=FALSE)
#create X
Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
#create y
if( classy%in%c("character", "integer", "numeric") )
ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
if( classy=="factor" )
ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )
data.out <- data.frame(ynew, Xnew)
rownames(data.out) <- 1:N
list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}
######################################################################
#Over sampling
######################################################################
over.sampl <- function(n, N, p, ind.majo, ind.mino, majoY, minoY, y, classy, X)
{
n.majo <- n.majo.new <- sum(y == majoY)
n.mino <- n-n.majo
if( missing(N) )
{
if( p<n.mino/n )
warning("non-sensible to specify p smaller than the actual proportion of minority class examples in the original sample.\n")
#theoretical n.mino
n.mino <- round( p*n.majo/(1-p) )
#estimated n.mino
n.mino.new <- sum( rbinom(n.mino+n.majo, 1, p) )
#final sample size
N <- n.majo + n.mino.new
}
else
{
if(N<n)
stop("N must be greater or equal than the actual sample size.\n")
else
n.mino.new <- N-n.majo
}
id.majo.new <- ind.majo
id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)
#create X
Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
#create y
if( classy%in%c("character", "integer", "numeric") )
ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
if( classy=="factor" )
ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )
data.out <- data.frame(ynew, Xnew)
rownames(data.out) <- 1:N
list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}
######################################################################
#Rose generation
######################################################################
rose.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X, classx, d, T, hmult.majo, hmult.mino)
{
# variables: must be numeric, integer or factor
if( any( is.na( pmatch(classx, c( "numeric","integer","factor"), duplicates.ok = TRUE ) ) ) )
stop("The current implementation of ROSE handles only continuous and categorical variables.\n")
if( any(T < 2) )
stop("ROSE needs at least two majority and two minority class examples.\n")
if( missing(N) )
N <- n
#number of new minority class examples
n.mino.new <- sum(rbinom(N, 1, p))
#number of new majority class examples
n.majo.new <- N-n.mino.new
id.majo.new <- sample(ind.majo, n.majo.new, replace=TRUE)
id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)
id.num <- which(classx=="numeric" | classx=="integer")
d.num <- d-length( which(classx=="factor") )
#create X
Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
if(d.num > 0)
{
Xnew[1:n.majo.new, id.num] <- rose.real(X[,id.num], hmult=hmult.majo, n=length(ind.majo), q=d.num, ids.class=ind.majo, ids.generation=id.majo.new)
Xnew[(n.majo.new+1):N, id.num] <- rose.real(X[,id.num], hmult=hmult.mino, n=length(ind.mino), q=d.num, ids.class=ind.mino, ids.generation=id.mino.new)
}
#create y
if( classy%in%c("character", "integer", "numeric") )
ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
if( classy=="factor" )
ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )
data.out <- data.frame(ynew, Xnew)
rownames(data.out) <- 1:N
list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}
######################################################################
#function to generate synthetic real data
######################################################################
##This function is NOT exported
rose.real <- function(X, hmult=1, n, q = NCOL(X), ids.class, ids.generation)
{
X <- data.matrix(X)
n.new <- length(ids.generation)
cons.kernel <- (4/((q+2)*n))^(1/(q+4))
if(q!=1)
H <- hmult*cons.kernel*diag(apply(X[ids.class,], 2, sd), q)
else
H <- hmult*cons.kernel*sd(X[ids.class,])
Xnew.num <- matrix(rnorm(n.new*q), n.new, q)%*%H
Xnew.num <- data.matrix(Xnew.num + X[ids.generation,])
Xnew.num
}
######################################################################
#Wrapper for ROSE
######################################################################
ROSE <- function(formula, data, N, p=0.5, hmult.majo=1, hmult.mino=1, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{
mc <- match.call()
obj <- omnibus.balancing(formula, data, subset, na.action, N, p, method="rose", seed, hmult.majo, hmult.mino)
out <- list(Call=mc, method="ROSE", data=obj$data)
class(out) <- "ROSE"
out
}
##print method for ROSE
print.ROSE <- function(x, ...)
{
cat("\n")
cat("Call: \n")
print(x$Call)
cat("\n")
cat("Data balanced by", x$method,"\n")
cat("\n")
print(x$data)
}
###summary method for ROSE
summary.ROSE <- function(object, ...)
{
out <- list( Call=object$Call, Summary=summary(object$data) )
class(out) <- "summary.ROSE"
out
}
###print method for summary ROSE
print.summary.ROSE <- function(x, ...)
{
cat("\n")
cat("Call: \n")
print(x$Call)
cat("\n")
cat("Summary of data balanced by ROSE","\n")
cat("\n")
print(x$Summary)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.