R/data_balancing_funcs.R
In ROSE: Random Over-Sampling Examples

Documented in ovun.sample ROSE

#Last modified on 01/30/2014 
######################################################################
.onAttach <- function(libname,pkgname){
   packageStartupMessage("Loaded ROSE ", as.character(packageDescription("ROSE")[["Version"]]),"\n")
}
######################################################################

######################################################################
#ovun.sample main function
######################################################################
ovun.sample <- function(formula, data, method="both", N, p=0.5, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{

	###checks
		if( missing(formula) ) 
			stop("formula is reaquired.\n")

	method <- match.arg(method, choices=c("both", "under", "over"))
		if( !method%in%c("both", "over", "under") ) 
			stop("Method must be 'both', 'over', or 'under'.\n")
	###
	Call <- match.call()
	m <- match(c("formula", "data","method","N", "p", "seed", "subset", "na.action"), names(Call), 0L)
	Call1 <- Call[c(1L, m)]
	Call1[[1L]] <- omnibus.balancing
	res <- eval(Call1)
	out <- list(Call=match.call(), method=method, data=res$data)
	class(out) <- "ovun.sample"
	out
}

##print method for ovun.sample
print.ovun.sample <- function(x, ...) 
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	Method <- switch(match.arg(x$method, choices=c("both", "under", "over")),
							both="combination of over- and under-sampling",
							under="undersampling",
							over="oversampling"
						 )
	cat("\n")
	cat("Data balanced by", Method,"\n")
	cat("\n")
	print(x$data)
}

###summary method for ovun.sample
summary.ovun.sample <- function(object, ...) 
{
	out <- list( Call=object$Call, Summary=summary(object$data), method=object$method )
	class(out) <- "summary.ovun.sample"
	out
}

###print method for summary ovun.sample
print.summary.ovun.sample <- function(x, ...) 
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	cat("\n")

	Method <- switch(match.arg(x$method, choices=c("both", "under", "over")),
							both="combination of over- and under-sampling",
							under="undersampling",
							over="oversampling"
						 )

	cat("Summary of data balanced by", Method ,"\n")
	cat("\n")
	print(x$Summary)
}

######################################################################
##function that provides a formula with non tranformed variables only
######################################################################
##this function is NOT exported
adj.formula <- function(formula, data)
{
	if( missing(data) )
		frml.env <- environment(formula)
	else
		frml.env <- data

	formula <- terms(formula, data = frml.env)
	vars <- attr(formula, "variables")
	vars <- sapply(vars, function(x) paste(deparse(x,width.cutoff=500), collapse=' '))[-1L]
	#remove all characters before either ( or /
	vars <- sub("*.*[(/]","", vars)
	#remove all characters after either ^ or )
	vars <- sub("['^')].*","", vars)
	vars <- unique(vars)
	formula <- as.formula(paste(vars[1], "~", paste(vars[-1], collapse= "+")))
	attr(formula, "variables") <- vars
	formula
}


######################################################################
#This function is the wrapper for all the implemented data balaning remedies 
######################################################################
##this function is NOT exported
omnibus.balancing <- function(formula, data, method, subset, na.action, N, p=0.5, seed, hmult.majo=1, hmult.mino=1)
{

		if( missing(formula) ) 
			stop("formula is required\n")
		if(missing(method))
				method <- "both"
		if( (method=="under" | method=="over" ) & !missing(N) & !missing(p) )
			stop("Too many arguments. Need to specify either N or p.\n")

	formula.orig <- formula
	formula <- adj.formula(formula, data)

	if( missing(subset) )
		subset=options("subset")$subset
	if( missing(na.action) )
		na.action=options("na.action")$na.action

	flg.data <- 0
		if( !missing(data) )
		{
			lst.model.frame <- list(formula=formula, data=data, subset=subset, na.action=na.action)

				if( is.environment(data) )#| is.list(data) ) 
					flg.data <- 2
				else
					flg.data <- 1
		}
		else
			lst.model.frame <- list(formula=formula, data=NULL, subset=subset, na.action=na.action)

		if( formula.orig[[3]]!="." & eval(formula)!=formula.orig )
			warning("Transformations of variables are not allowed.\n New data have been generated by using non-transformed variables.\n ")

	mf <- do.call(model.frame, lst.model.frame)
	cn <- rownames( attributes( attributes(mf)$terms )$factors )
	data.st <- data.frame(mf)
	y <- data.st[, 1]
	X <- data.frame(data.st[,-1])

	n <- length(y)
	d <- NCOL(X)

	classy <- class(y)
	y <- factor(y)
	T <- table(y)
	classx <- sapply(as.data.frame(X), class)

	###checks
		if(n<2) 
			stop("Too few observations.\n")	

		if( length(T)>2 )
			stop("The response variable must have 2 levels.\n")
		else
			if( length(T)==1 )
				stop("The response variable has only one class.\n")

		if( p<0 | p>1 ) 
			stop("p must be in the interval 0-1.\n")

	###

	#identify which is the label associated to the majority and minority classes
	majoY  <- levels(y)[which.max(T)]
	minoY  <- levels(y)[which.min(T)]

	#identify the majority and minority class examples
	ind.mino <- which( y == minoY )
	ind.majo <- which( y == majoY )

		if( !missing(seed) ) 
			set.seed(seed)

	data.obj <- switch(method,
								 both = ou.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, classy, X),
								 over = over.sampl(n, N, p, ind.majo, ind.mino, majoY, minoY, y, classy, X),
								 under = under.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X),
								 rose = rose.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X, classx, d, T, hmult.majo, hmult.mino)
						)

	data.out <- data.obj$data.out
	ynew <- data.obj$ynew
	Xnew <- data.obj$Xnew

		#re-position columns
		if( !missing(data) & flg.data!=0 )
		{
				#put data frame names in the right order
				if(flg.data==1)
					colnames(data.out) <- colnames(data)[colnames(data)%in%cn]
				else
					colnames(data.out) <- attr(formula, "variables")[attr(formula, "variables")%in%cn] 

			#insert y
			indY <- colnames(data.out)==cn[1]
			data.out[, indY] <- ynew

			#see wether the order of the variables in formula is the same as in data. If no, swap columns according to the order in data
			swap.col <- order( pmatch( cn[-1], colnames(data.out)[!indY] ) )
			data.out[,!indY] <- Xnew[, (1:d)[swap.col] ]
		}
		else
		{
				if( length(cn)-1 < d )
					colnames(data.out) <- c(cn[1], colnames(X))
				else
					colnames(data.out) <- cn
		}

	list(data=data.out, call=match.call())
}

######################################################################
#Combination of over and under sampling
######################################################################
##this function is NOT exported
ou.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, classy, X)
{

		if( missing(N) )
			N <- n
	#number of new minority class examples
	n.mino.new <- sum(rbinom(N, 1, p))
	#number of new majority class examples
	n.majo.new <- N-n.mino.new

	id.majo.new <- sample(ind.majo, n.majo.new, replace=TRUE)
	id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)

	#create X
	Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
	#create  y
		if( classy%in%c("character", "integer", "numeric") )
			ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
		if( classy=="factor" )  
			ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )

	data.out <- data.frame(ynew, Xnew)
	rownames(data.out) <- 1:N

	list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}

######################################################################
#Under sampling
######################################################################
##this function is NOT exported
under.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X)
{

	n.mino.new <- sum(y == minoY)

		if( missing(N) )
		{
				# Determination of N and n.majo in version 0.0.2
				if( p<n.mino.new/n ) 
					warning("non-sensible to specify p smaller than the actual proportion of minority class examples in the original sample.\n")
			#theoretical n.majo
			n.majo <- round( (1-p)*n.mino.new/p )
			#estimated n.majo
			n.majo.new <- sum( rbinom(n.mino.new+n.majo, 1, 1-p) )
			#final sample size
			N <- n.majo.new + n.mino.new
		}
		else
		{
			if(N<n.mino.new)
				stop("N must be greater or equal than the number of minority class examples.\n")
			else
				n.majo.new <- N-n.mino.new
		}

	id.mino.new <- ind.mino
	id.majo.new <- sample(ind.majo, n.majo.new, replace=FALSE)

	#create X
	Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
	#create  y
		if( classy%in%c("character", "integer", "numeric") )
			ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
		if( classy=="factor" )  
			ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )

	data.out <- data.frame(ynew, Xnew)
	rownames(data.out) <- 1:N

	list(data.out=data.out, ynew=ynew, Xnew=Xnew)

}

######################################################################
#Over sampling
######################################################################
over.sampl <- function(n, N, p, ind.majo, ind.mino, majoY, minoY, y, classy, X)
{

	n.majo <- n.majo.new <- sum(y == majoY)
	n.mino <- n-n.majo

		if( missing(N) )
		{
				if( p<n.mino/n ) 
					warning("non-sensible to specify p smaller than the actual proportion of minority class examples in the original sample.\n")
				#theoretical n.mino
				n.mino <- round( p*n.majo/(1-p) )
				#estimated n.mino
				n.mino.new <- sum( rbinom(n.mino+n.majo, 1, p) )
				#final sample size
				N <- n.majo + n.mino.new
		}
		else
		{
				if(N<n)
					stop("N must be greater or equal than the actual sample size.\n")
				else
					n.mino.new <- N-n.majo
		}

	id.majo.new <- ind.majo
	id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)

	#create X
	Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
	#create  y
		if( classy%in%c("character", "integer", "numeric") )
			ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
		if( classy=="factor" )  
			ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )

	data.out <- data.frame(ynew, Xnew)
	rownames(data.out) <- 1:N

	list(data.out=data.out, ynew=ynew, Xnew=Xnew)

}

######################################################################
#Rose generation
######################################################################
rose.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X, classx, d, T, hmult.majo, hmult.mino)
{
		# variables: must be numeric, integer or factor
		if( any( is.na( pmatch(classx, c( "numeric","integer","factor"), duplicates.ok = TRUE ) ) ) ) 
			stop("The current implementation of ROSE handles only continuous and categorical variables.\n")

		if( any(T < 2) ) 
			stop("ROSE needs at least two majority and two minority class examples.\n")

		if( missing(N) )
			N <- n
	#number of new minority class examples
	n.mino.new <- sum(rbinom(N, 1, p))
	#number of new majority class examples
	n.majo.new <- N-n.mino.new

	id.majo.new <- sample(ind.majo, n.majo.new, replace=TRUE)
	id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)


	id.num  <- which(classx=="numeric" | classx=="integer")
	d.num   <- d-length( which(classx=="factor") )

	#create  X
	Xnew <- data.frame(X[c(id.majo.new, id.mino.new),])
		if(d.num > 0)  
		{
			 Xnew[1:n.majo.new, id.num] <- rose.real(X[,id.num], hmult=hmult.majo, n=length(ind.majo), q=d.num, ids.class=ind.majo, ids.generation=id.majo.new)
			 Xnew[(n.majo.new+1):N, id.num] <- rose.real(X[,id.num], hmult=hmult.mino, n=length(ind.mino), q=d.num, ids.class=ind.mino, ids.generation=id.mino.new)
		}

	#create  y
		if( classy%in%c("character", "integer", "numeric") )
			ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
		if( classy=="factor" )  
			ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )

	data.out <- data.frame(ynew, Xnew)
	rownames(data.out) <- 1:N

	list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}


######################################################################
#function to generate synthetic real data
######################################################################
##This function is NOT exported
rose.real <- function(X, hmult=1, n, q = NCOL(X), ids.class, ids.generation)
{
	X <- data.matrix(X)
	n.new <- length(ids.generation)
	cons.kernel <- (4/((q+2)*n))^(1/(q+4))

		if(q!=1)
			H <- hmult*cons.kernel*diag(apply(X[ids.class,], 2, sd), q)
		else
			H <- hmult*cons.kernel*sd(X[ids.class,])

	Xnew.num <- matrix(rnorm(n.new*q), n.new, q)%*%H
	Xnew.num <- data.matrix(Xnew.num + X[ids.generation,])
	Xnew.num
}

######################################################################
#Wrapper for ROSE
######################################################################
ROSE <- function(formula, data, N, p=0.5, hmult.majo=1, hmult.mino=1, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{
	mc <- match.call()
	obj <- omnibus.balancing(formula, data, subset, na.action, N, p, method="rose", seed, hmult.majo, hmult.mino)
	out <- list(Call=mc, method="ROSE", data=obj$data)
	class(out) <- "ROSE"
	out
}

##print method for ROSE
print.ROSE <- function(x, ...) 
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	cat("\n")
	cat("Data balanced by", x$method,"\n")
	cat("\n")
	print(x$data)
}

###summary method for ROSE
summary.ROSE <- function(object, ...) 
{
	out <- list( Call=object$Call, Summary=summary(object$data) )
	class(out) <- "summary.ROSE"
	out
}

###print method for summary ROSE
print.summary.ROSE <- function(x, ...) 
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	cat("\n")

	cat("Summary of data balanced by ROSE","\n")
	cat("\n")
	print(x$Summary)
}