# R/genericFunctions.R

# <OWNER> = Saip CISS
# <ORGANIZATION> = QueensBridge Quantitative
# <YEAR> = 2014

# LICENSE 
# BSD 3-CLAUSE LICENSE

# Copyright (c) 2014, Saip CISS
# All rights reserved.

# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# END OF LICENSE 

# remove characters from strings
rm.string = function(string, which.char, exact = TRUE)
{
	string2 = vector("list", length(string))
	n = length(string)
	
	for (i in 1:n)
	{	
		# split on 'which.char' and glue the pieces back together, whatever their number
		string2[[i]] = paste(strsplit(string[i], which.char, fixed = exact)[[1]], collapse = "")
	}
	
	return(unlist(string2))	
}
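
# Illustrative usage of rm.string() (a commented-out sketch; inputs are made up):
# rm.string(c("V_1", "V_2"), "_")   # returns c("V1", "V2")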

#concat
concatCore <- function(X)
{
	concatX = NULL
	for (i in 1:length(X))
	{	concatX =  paste(concatX, X[i], sep="") }
	return(concatX)
}

concat <- function(X) apply(X, 1, function(Z) concatCore(as.character(Z)))

# most frequent value (mode), after rounding
modX <- function(X) as.numeric(names(which.max(table(round(X,0)))))
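
# Illustrative usage of modX() (commented out; values are made up):
# modX(c(1.2, 1.4, 2.9, 3.1, 3))   # rounds to 1, 1, 3, 3, 3 and returns 3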

# indexes
find.idx =  function(sample_ID_vector, ID_vect, sorting = TRUE, replacement = 0)
{
	if (sorting == TRUE)
	{	ghost_idx_file = match(sort(ID_vect), sort(sample_ID_vector))	}
	else
	{	ghost_idx_file = match(ID_vect, sample_ID_vector)	}
	
	return(which(!is.na(ghost_idx_file)))	
}

find.first.idx <-  function(X, sorting = TRUE)
{
	all_idx = which.is.duplicate(X, sorting = sorting)
	first_idx = unique(match(X[all_idx],X)) 
	return(first_idx)
}

which.is.duplicate <- function(X, sorting = TRUE) 
{
	if (sorting)  {	 X =sort(X)	}
	return(na.omit(which(duplicated(X) == TRUE)) )
}

# FACTORS AND CHARACTERS
factor2vector = function(d)
{
	if (is.factor(d))
	{
		Lev = levels(d)
		G = vector(length = length(d))
		if (is.numeric(Lev[1]))
		{
			if (as.numeric(Lev[1]) == 0)
			{	G = d - 1	}
		}
		else
		{ 	G = cbind(d)	}
		d = G
		return(list(vector = as.numeric(d), factors = Lev))
	}
	else
	{ return(list(vector = as.vector(d), factors = NA)) }
}
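
# Illustrative usage of factor2vector() (commented out):
# factor2vector(factor(c("a", "b", "a")))
# # $vector = c(1, 2, 1), $factors = c("a", "b")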

which.is.factor <- function(X, maxClasses = floor(0.01*min(3000, nrow(X))+2), count = FALSE) 
{
	p = ncol(X)
	n = nrow(X)
	values = rep(0,p)
	options(warn = -1)
	
	for (j in 1:p) 
	{ 
		uniqueValues = unique(X[,j])
		countValues <- length(uniqueValues)
				
		if (is.factor(X[,j]))
		{ 
			XLevels = levels(uniqueValues)
			checkThis = which(XLevels == "?")
			if (length(checkThis) > 0) { XLevels =  XLevels[-checkThis] }

			checkNA = which(is.na(XLevels))
			if (length(checkNA) > 0) { XLevels =  XLevels[-checkNA] }
			
			if (count) { values[j] = countValues }
			else {  values[j] = 1 	}	
		}
		else
		{
			if (!is.numeric(X[,j]))
			{  
				if (countValues <= maxClasses)
				{ 
					if (count) { values[j] = countValues  }
					else { 	values[j] = 1 }	
				}
			}
		} 	
	}
	names(values) = colnames(X)
	options(warn = 0)
	
	return(values)
}
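
# Illustrative usage of which.is.factor() (commented out; data are made up):
# X = data.frame(num = rnorm(10), cat = factor(letters[1:10]))
# which.is.factor(X)                  # c(num = 0, cat = 1)
# which.is.factor(X, count = TRUE)    # c(num = 0, cat = 10)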


# MATRIX
factor2matrix = function(X, threads = "auto")
{
	if (is.factor(X) || is.vector(X))
	{ return(as.true.matrix(factor2vector(X)$vector)) }
	else
	{
		np = dim(X)
		if  (max(np) < 1000)
		{			
			for (j in 1:np[2])
			{	
				if (is.character(X[,j]))	{ X[,j] = as.factor(X[,j]) }
				
				X[,j] = factor2vector(X[,j])$vector	
			}
			return(as.true.matrix(X))
		}
		else
		{
			max_threads = detectCores(logical = FALSE)	
			
			if (threads == "auto")
			{	
				if (max_threads == 2) { threads = max_threads }
				else {	threads  = max(1, max_threads - 1)  }
			}
			else
			{
				if (max_threads < threads) 
				{	
					maxLogicalThreads = detectCores()
					if (maxLogicalThreads < threads)
					{ cat("Warning : number of threads indicated by user was higher than logical threads in this computer.\n") }
				}
			}
			
			Cl = makePSOCKcluster(threads, type = "SOCK")
			registerDoParallel(Cl)
			chunkSize  <-  ceiling(np[2]/getDoParWorkers())
			smpopts  <- list(chunkSize  =  chunkSize)
			Z = matrix(NA, np[1], np[2])			
			Z <- foreach(j =1:np[2], .export = "factor2vector", .options.smp = smpopts, .combine = cbind) %dopar%
			{
				if (is.character(X[,j]))	{ X[,j] = as.factor(X[,j]) }
				factor2vector(X[,j])$vector	
			}
			
			colnames(Z) <- colnames(X)
			stopCluster(Cl)
			
			return(as.true.matrix(Z))
		}
	}
}

matrix2factor2 <- function(X, maxClasses = 10)
{
	idx = which.is.factor(X,maxClasses = maxClasses)
	factors = which(idx != 0)
	if (length(factors) > 0)
	{	for (i in 1:length(factors)) {	X[,factors[i]] = as.factor(X[,factors[i]])	}	}
	else 
	{ cat("No variable found as categorical. Please try increasing 'maxClasses'.\n") }
	return(X)
}


NAfactor2matrix <- function(X, toGrep = "")
{
	if (length(toGrep) > 1)
	{
		toGrepIdx = NULL
		for (i in 1:length(toGrep))
		{  toGrepIdx = rbind(toGrepIdx, which(X == toGrep[i], arr.ind = TRUE)) }
	}
	else
	{	
		if (toGrep == "anythingParticular")	{ toGrepIdx = integer(0) }
		else  { toGrepIdx = which(X == toGrep, arr.ind = TRUE)  }		
	}
	
	NAIdx = which(is.na(X), arr.ind = TRUE)
	
	X <- factor2matrix(X)
	
	if (!is.null(dim(toGrepIdx)) && (nrow(toGrepIdx) > 0)) { X[toGrepIdx] = NA	}
	if (!is.null(dim(NAIdx)) && (nrow(NAIdx) > 0)) {  X[NAIdx] = NA	  }
	
	return(X)
}

NAFeatures <- function(X) apply(X,2, function(Z) { if (length(rmNA(Z)) == 0) { 1 } else{ 0 } })

rmNA = function(X)
{
	NAIdx = which(is.na(X))	
	if (length(NAIdx) > 0) { return(X[-NAIdx])	}
	else { return(X) }
}

NATreatment <- function(X, Y, na.action = c("veryFastImpute", "fastImpute", "accurateImpute", "omit"), regression = TRUE)
{
	p = ncol(X)
	featuresWithNAOnly = which(NAFeatures(X) == 1)
	if (length(featuresWithNAOnly) == p) { stop("Data have only NA values") }
	if (length(featuresWithNAOnly) > 0) 
	{ 
		X = X[,-featuresWithNAOnly] 
		cat("\nVariables with NA values only have been found and removed.\n")
	}
	NAvalues = which(is.na(X), arr.ind = TRUE)
	NAInputs = NAvalues[,1]
	matchNA = (length(NAInputs) > 0)		
	if ( (length(unique(NAInputs)) > (nrow(X) - 30)) && (na.action[1] == "omit") ) 
	{ stop("Too many missing values in the data. Please impute missing values in order to learn the data.\n") }		
	if (is.factor(Y)) levelsY = levels(Y)
	if (!regression && !is.factor(Y) ) { Y = as.factor(Y); levelsY = levels(Y) }	
	
	NALabels = which(is.na(Y))
	matchNALabels = (length(NALabels) > 0)		
	if ( (length(NALabels) > (length(Y) - 30)) && (na.action[1] == "omit") )	{ stop("Too many missing values in the responses.\n") }			
	if (matchNA || matchNALabels)
	{
		if (!is.null(Y))
		{
			if (matchNALabels) { newY = Y }
			else  { newY <- as.vector(NAfactor2matrix(as.numeric(Y))) }
			if (na.action[1] != "omit") 
			{ 
				cat("NA found in data. Imputation (fast or accurate, depending on option) is used for missing values.\nIt is strongly recommended to impute values outside modelling, using one of many available models.\n")					
				XY <- na.impute(cbind(X, newY), type = na.action[1])
				Y <- XY[,p+1]
				X <- XY[,-(p+1)]
				if (!regression) {  Y = as.factor(Y); levels(Y) = levelsY }
				rm(XY)				
			}
			else
			{
				if (matchNALabels && matchNA) {	rmRows = unique(c(NAInputs, NALabels))	}
				else
				{
					rmRows = NULL
					if (matchNA) {	rmRows = c(rmRows, unique(NAInputs))	}						
					if (matchNALabels) 	{	rmRows = c(rmRows,unique(NALabels))	}
				}					 
				X = X[-rmRows,]
				Y = Y[-rmRows]
			}
		}
		else
		{
			cat("\nIf accuracy is needed, it is strongly recommended to impute values outside modelling, using one of many available models.\n")
			if (na.action[1] != "omit") { 	X <- na.impute(X, type = na.action[1])	}
			else
			{		
				if (matchNA) { X <- X[-NAInputs,] 	}		
			}
		}
	}
		
	return(list(X = X, Y = Y))
}		


rmInf = function(X)
{
	InfIdx = which((X == Inf) | (X == -Inf))	
	if (length(InfIdx) > 0) { return(X[-InfIdx])	}
	else { return(X) }
}	

vector2factor = function(V) as.factor(V)


matrix2factor = function(X, which.columns) 
{
	X =	as.data.frame(X)
	for (j in which.columns)
	{	X[,j] = as.factor(X[,j])	}
	
	return(X)
}

vector2matrix <- function(X, nbcol) if (is.vector(X)) { matrix(data = X, ncol = nbcol) } else { X }

fillVariablesNames <- function(X)
{
	if(is.null(colnames(X))) 
	{  
		varNames = NULL
		for (i in 1:ncol(X)) { varNames = c(varNames,paste("V", i, sep="")) }
		colnames(X) = varNames
	}
	
	return(X)
}

as.true.matrix = function(X)
{
	if (is.matrix(X))
	{ return(X) }
	else
	{
		if (is.vector(X))
		{	
			names_X = names(X)
			X = t(as.numeric(as.vector(X)))
			if (!is.null( names_X ) ) { colnames(X) = names_X	}		
		}
		else
		{
			if (is.data.frame(X))
			{	
				names_X = colnames(X)
				X = as.matrix(X)  	
				col_X = ncol(X)				
				X = mapply( function(i) as.numeric(factor2vector(X[,i])$vector), 1:col_X)
				if (is.vector(X)) { X = t(X) }				
				if (!is.null(names_X)) { colnames(X) = names_X	}		
			}		
		}
		return(X)
	}
}

sortMatrix = function(X, which.col, decrease = FALSE)
{
	if (is.character(which.col))
	{ 	which.col = which( colnames(X) == which.col)	}
	
	return(X[order(X[,which.col], decreasing = decrease),])	
}

sortDataframe <- function(X, which.col, decrease = FALSE)
{
	if (is.character(which.col))
	{ 	which.col = which( colnames(X) == which.col)	}

	idx = order(X[,which.col], decreasing = decrease)
	
	return(X[idx,])
}	
	
genericCbind <- function(X,Y, ID = 1)
{
	matchIdx = match(X[,ID],Y[,ID])
	
	if (all(is.na(matchIdx)))
	{  	stop("Matrix do not share same values in 'ID'.\n") 	}
	else
	{
		naIdx = which(is.na(matchIdx))
		
		Z = matrix(data = 0, ncol = ncol(Y), nrow = nrow(X))
		colnames(Z) = colnames(Y)
		
		Z[which(!is.na(matchIdx)),] = Y[na.omit(matchIdx),]
		Z[naIdx,ID] = X[naIdx,ID]
		
		newZ = cbind(X,Z[,-1])

		return(newZ)
	}
}
	
is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)  abs(x - round(x)) < tol

which.is.wholenumber <- function(X)  which(is.wholenumber(X))
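
# Illustrative usage (commented out):
# is.wholenumber(sqrt(2)^2)             # TRUE: 2 + 4.4e-16 is whole within tolerance
# which.is.wholenumber(c(1, 2.5, 3))    # c(1, 3)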

fillWith <- function(X, toPut = "NA")
{
	for (j in 1:ncol(X))
	{ 
		if ( is.factor(X[,j]) || is.character(X[,j]) )
		{
			X[,j]= as.character(X[,j])
			otheridx = which(X[,j] == "")
			
			if (length(otheridx) > 0)
			{
				levelsLength = length(levels(X[,j]))
				levels(X[,j])[levelsLength]= toPut 
				X[otheridx,j] = toPut
			}
		}
	}
	
	return(X)
}

na.impute <- function(X, type = c("veryFastImpute", "fastImpute", "accurateImpute"))
{
	if (type[1] != "accurateImpute")
	{	
		if (type[1] == "veryFastImpute")
		{	return(na.replace(X, fast = TRUE))	}
		else
		{ 	return(na.replace(X, fast = FALSE))	}
	}
	else
	{ 	return(fillNA2.randomUniformForest(X)) 	}
}



# sample replacement (good and automatic, but prefer na.impute for accuracy)
na.replace = function(X, fast = FALSE)
{
	na.matrix = which(is.na(X), arr.ind = TRUE)
	n = nrow(X)
	factors <- which.is.factor(X)
	
	if (is.null(dim(na.matrix))) 
	{ return(X)	}
	else
	{ 
		p = unique(na.matrix[,2])
		for (j in seq_along(p))
		{
			i.idx = na.matrix[which(na.matrix[,2] == p[j]),1]
			
			if (!fast)
			{
				subLength <- length(rmNA(X[,p[j]]))
				for (i in seq_along(i.idx))
				{  
					if (factors[p[j]] == 1)	{  X[i.idx[i],p[j]] <- modX(sample(rmNA(X[,p[j]]),subLength, replace = TRUE)) 	}
					else
					{
						while(is.na(X[i.idx[i],p[j]])) 	{ 	X[i.idx[i],p[j]] <-  mean(rmNA(X[sample(1:n, sample(n, 1)),p[j]])) 	}
						
						if ( length(which.is.wholenumber(rmNA(X[,p[j]]))) == subLength ) 	
						{  X[i.idx[i],p[j]] =  floor(X[i.idx[i],p[j]]) }
					}
				}
			}
			else
			{	
				if (factors[p[j]] == 1)
				{ 	X[i.idx ,p[j]] = modX(rmNA(X[,p[j]]))	}
				else
				{	X[i.idx,p[j]] = median(rmNA(X[,p[j]]))	}
			}
		}
		
		return(X)
	}
}
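
# Illustrative usage of na.replace() (commented out; values are made up):
# X = cbind(a = c(1, NA, 3, 4), b = c(10, 20, NA, 40))
# na.replace(X, fast = TRUE)   # NA in 'a' becomes median(1, 3, 4) = 3, NA in 'b' becomes 20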


parallelNA.replace <- function(X, threads = "auto", parallelPackage = "doParallel", fast = FALSE)
{
	p = ncol(X)
	# code parallelization
	{
		export = "na.replace"	
		max_threads = detectCores(logical = FALSE)		
		if (threads == "auto")
		{	
			if (max_threads == 2) { threads = max_threads }
			else {	threads = max(1, max_threads - 1)  }
		}
		else
		{
			if (max_threads < threads) 
			{	
				maxLogicalThreads = detectCores()
				if (maxLogicalThreads < threads)
				{ 
					cat("Warning : number of threads indicated by user was higher than logical threads in this computer.") 
				}
			}
		}
				
		{
			Cl = makePSOCKcluster(threads, type = "SOCK")
			registerDoParallel(Cl)
		}
	}
		
	X <- foreach(i = 1:p, .export = export, .inorder = TRUE, .combine = cbind) %dopar%
	{  na.replace(X[, i, drop = FALSE], fast = fast)	}
	
	stopCluster(Cl)
	
	return(X)
}


pseudoNAReplace = function(X, toGrep = "NA")
{
	
	p =ncol(X)
	for (j in 1:p)
	{
		if( ( is.character(X[,j]) || is.factor(X[,j]) ) )
		{
			X[,j] = as.character(X[,j])
			idx = which(X[,j] == toGrep)
			
			if (length(idx) > 0)
			{
				factorTable = table(X[,j])
				factorTableNames = names(factorTable)
				factorTableNames = factorTableNames[-which(factorTableNames == toGrep)]
				
				sampleIdx = sample(seq_along(factorTableNames),length(idx), replace = TRUE)
				X[idx, j] = factorTableNames[sampleIdx]
			}
		}
	}
	
	return(X)
}
	

na.missing = function(X, missing.replace = 0)
{
	na.data = which(is.na(X))
	X[na.data] = missing.replace
	return(X)
}

which.is.na = function(X) 
{
	if (!is.vector(X))
	{ 	X = factor2vector(X)$vector }
	
	return(which(is.na(X)))
}

# random combination: variables are combined in pairs, so 'combination' must hold an even number of indexes
randomCombination <- function(X, combination = 2, weights = NULL)
{
	howMany = ifelse(length(combination) > 1, length(combination), combination)
	n = nrow(X); p = ncol(X)
	L.combination = length(combination)
	if (L.combination > 1)
	{  
		var.samples = combination
		if (is.null(weights))
		{	var.weights = replicate(L.combination/2, sample(c(-1,1),1)*runif(1))	}
		else
		{	var.weights =  weights }
			
		X.tmp = matrix(NA, nrow(X), L.combination/2)
		colnames(X.tmp)= rep("V", ncol(X.tmp))
		idx = 1
		for (j in 1:(L.combination/2))
		{	
			X.tmp[,j] = var.weights[j]*X[,combination[idx]] + (1 - var.weights[j])*X[,combination[idx+1]]
			idx = idx + 2
		}
		
		idx = 1
		for (i in 1:ncol(X.tmp)) 
		{ 
			colnames(X.tmp)[i] = paste("V", combination[idx], "x", combination[idx+1], sep="") 
			idx = idx + 2
		}
		X.tmp = cbind(X, X.tmp)
		
	}
	else
	{ 
		var.samples = sample(1:p, 2*howMany)	
		var.weights = replicate(howMany, sample(c(-1,1),1)*runif(1))
		var.weights = cbind(var.weights, 1 - var.weights)

		X.tmp = X
		idx = 1
		for (j in 1:(length(var.samples)/2))
		{	
			X.tmp[,var.samples[idx]] = var.weights[j,1]*X[,var.samples[idx]] + var.weights[j,2]*X[,var.samples[idx+1]]
			idx = idx + 2
		}
	}
	
	return(X.tmp)	
}
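
# Illustrative usage of randomCombination() (commented out; X is a made-up matrix):
# X = matrix(rnorm(40), 10, 4)
# randomCombination(X, combination = c(1, 2))
# # appends a column "V1x2" equal to w*X[,1] + (1 - w)*X[,2], with w drawn in [-1, 1]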
	

# LISTS
mergeLists <- function(X, Y, add = FALSE, OOBEval = FALSE)
{
	n = length(X)
	if (length(Y) != n)
	{ stop("lists size not equal.\n") }
	else
	{ 	
		Z = vector('list', n)
		names(Z) = names(X)
		for (i in 1:n)
	    {    
			if (add)
			{ 	Z[[i]] = X[[i]] + Y[[i]]  }
			else
			{   
				if (OOBEval)
				{
					if (is.matrix(X[[i]]))
					{	
						pX = ncol(X[[i]]); pY = ncol(Y[[i]])
						if ( pX == pY  )	
						{	
							if (nrow(X[[i]]) == nrow(Y[[i]])) {	Z[[i]] = cbind(X[[i]],Y[[i]])	}
							else {	Z[[i]] = rbind(X[[i]],Y[[i]])	}
						}
						else
						{
							if ( nrow(X[[i]]) == nrow(Y[[i]]) )	{	Z[[i]] = cbind(X[[i]],Y[[i]])	}
							else 
							{	
								pSamples <- (pX - pY)
								
								if (pSamples < 0)
								{	
									newM <- matrix(apply(X[[i]], 1, function(Z) sample(Z, abs(pSamples), replace= TRUE)), ncol = abs(pSamples))
									XX = cbind(X[[i]],newM)
									YY = Y[[i]]
								}
								else
								{
									newM <- matrix(apply(Y[[i]], 1, function(Z) sample(Z, pSamples, replace= TRUE)), ncol = pSamples)
									YY = cbind(Y[[i]], newM)
									XX = X[[i]]
								}								
															
								ZZ = rbind(XX,YY)
								newM2 <- matrix(apply(ZZ, 1, function(Z) sample(Z, min(pX, pY), replace= TRUE)), ncol = min(pX, pY))
							
								Z[[i]] = cbind(ZZ, newM2)
							
							}
						}							
					}
					else
					{	
						if ( (is.null(X[[i]])) && (is.null(Y[[i]])) ) { Z[[i]] = NULL }
						else {	Z[[i]] = c(X[[i]],Y[[i]])	}
					}
				}
				else
				{
					if (is.matrix(X[[i]])) 	{	Z[[i]] = rbind(X[[i]],Y[[i]])  }
					else 
					{		
						if ( (is.null(X[[i]])) && (is.null(Y[[i]])) ) { Z[[i]] = NULL }
						else {	Z[[i]] = c(X[[i]],Y[[i]])	}					
					}
				}
			}
		}
		names(Z) = names(X)
		return(Z)
	}
}


rm.InAList = function(X,idx)
{ 	
	tmp.X = X
	nb.list =length(X)
	new.X = vector('list',nb.list - length(idx))
	j = 1
	
	for (i in (1:nb.list)[-idx])
	{ 
		new.X[[j]] = X[[i]]
		names(new.X)[j] = names(X)[i] 
		j = j + 1
	}
	
	return(new.X)
}

rmInAListByNames <- function(X, objectListNames)
{
	for (i in 1:length(objectListNames))
	{
		idx = which(names(X) == objectListNames[i])
		
		if (length(idx) > 0)  { X = rm.InAList(X,idx) }
	}
	
	return(X)
}
		

# INSERT
insert.in.vector = function(X, new.X, idx)
{
	Y = vector(length = (length(X) + length(idx)) )
	n = length(Y)
	flag = 0;
	if ( idx <= (length(X) + 1) )
	{
		if (idx == (length(X) + 1))
		{	Y = c(X,new.X)	}
		else
		{		
			for (i in 1:n)
			{
				if(i == idx)
				{	Y[i] = new.X; Y[(i+1):n] = X[i:length(X)]; flag = 1; }
				else
				{	
					if (flag == 0)
					{	Y[i] = X[i]	}
				}
			}
		}
		return(Y)
	}
	else
	{	return(X)	}
}

insert.in.vector2 = function(X,new.X, idx)
{
	if  ( (length(idx) == 0) || (length(new.X) == 0) )
	{	return(X)	}
	else
	{
		Y = vector(length = (length(X) + length(idx)) )
		n = length(Y)
		for (j in 1:length(idx))
		{	Y[idx[j]] = new.X[j] 	}
		Y[-idx] = X
			
		return(Y)
	}
}
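
# Illustrative usage of insert.in.vector2() (commented out):
# insert.in.vector2(c(10, 20, 30), new.X = c(99, 98), idx = c(2, 4))
# # returns c(10, 99, 20, 98, 30)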


#count values 
count.factor <- function(X) 
{
	Xtable = table(X)
	values  = names(Xtable)
	row.names(Xtable) = NULL
	nb = Xtable
	return(list(values = values, numbers = nb))
}

# timer
timer = function(f)
{
	T1 = proc.time()
	object = f
	T2 = proc.time()
	return(list(time.elapse = T2-T1, object = object))
}

filter.object = function(X)
{
	if (is.list(X))
	{
		if (!is.null(X$time.elapse)) {  return(X$object)	}
		else { return(X) }
	}
	else
	{ return(X)	}
}

filter.forest = function(X)
{
	if (is.list(X))
	{
		if (!is.null(X$time.elapse)) 
		{  
			if (!is.null(X$object$forest)) 	{	return(X$object$forest)	}
			else { (X$object) }
		}
		else 
		{ 
			if (!is.null(X$forest))
			{	return(X$forest) }
			else
			{	return(X)	}
		}
	}
	else
	{ return(X)	}
}

# standardize between [0,1]  or [-1,1]
standardize = function(X)
{
	if (is.vector(X))
	{	Y = standardize_vect(X,sign(min(X)  +0.00001))	}
	else
	{
		L = nrow(X); C = ncol(X);
		Y = matrix( data = NA, nrow = L , ncol = C)
		for (i in 1:C)
		{	Y[,i] =  standardize_vect(X[,i],sign(min(X[,i]) +0.00001))	}
		
		if (!is.null((colnames(X)[1])))
		{ colnames(Y) = colnames(X)	}
	}
	return(Y)
}

standardize_vect = function(X,neg)
{
	if (length(unique(X))  > 1)
	{
		n = length(X);
		max_X = max(X)
		min_X = min(X)
		Y = (X - min_X )/(max_X - min_X );
		if (neg == -1)
		{
			pos_neg = which (X < 0 )
			if ( sum(pos_neg) > 0)
			{	Y[pos_neg] = ( -X[pos_neg] + max_X )/( min_X - max_X );	}
			Y = Y[1:n];
		}
		return(Y)
	}
	else
	{ return(X) }
}
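
# Illustrative usage of standardize() (commented out):
# standardize(c(2, 4, 6))   # c(0, 0.5, 1): values are mapped to [0,1]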


# init random train/test samples of any size
init_values <- function(X, Y = NULL, sample.size = 0.5, data.splitting = "ALL", unit.scaling = FALSE, scaling = FALSE, regression = FALSE)
{
	set.seed(sample(10000,1))
	if (is.vector(X)) {  n = length(X) } else { n = nrow(X) }
	indices = define_train_test_sets(n,sample.size,data.splitting)
	
	if (unit.scaling)
	{	X = standardize(X)	}
	
	if (scaling)
	{   X = scale(X); if (!is.null(Y) && regression) { Y = scale(Y) }	}
	
	
	if (data.splitting == "train")
	{
		xtrain = if (is.vector(X)) { X[indices$train] } else { X[indices$train,] }
		if (!is.null(Y))
		{ 	ytrain = Y[indices$train]	}
		else
		{  ytrain = Y	}
		
		return( list(xtrain = xtrain, ytrain = ytrain, train_idx = indices$train ) )
	}
	else
	{
		if (data.splitting == "test")
		{
			xtest =if (is.vector(X)) { X[indices$test] } else { X[indices$test,] }
			if (!is.null(Y))
			{ 	ytest = Y[indices$test]	}
			else
			{   ytest = Y	}
					
			return( list( xtest = xtest, ytest = ytest, test_idx = indices$test ) )
		}
		else
		{
			xtrain = if (is.vector(X)) { X[indices$train] } else { X[indices$train,] }
			xtest = if (is.vector(X)) { X[indices$test] } else { X[indices$test,] }
			
			if (!is.null(Y))
			{	ytrain = Y[indices$train]; 	ytest = Y[indices$test]	}
			else
			{   ytrain = ytest = Y	}
						
			return( list(xtrain = xtrain, ytrain = ytrain, xtest = xtest, ytest = ytest, train_idx = indices$train, 
				test_idx = indices$test) )
		}
	}
}

define_train_test_sets = function(N,sample.size,data.splitting)
{
	set.seed(sample(100,1))
	if (data.splitting == "train")
	{	
		v = sample(N)
		Ntrain = floor(N*sample.size)
		train_indices = v[1:Ntrain]
	
		return( list(train=train_indices) )
	}
	else
	{
		v = sample(N)
		Ntrain = floor(N*sample.size)
		train_indices = v[1:Ntrain]
		test_indices = v[(Ntrain+1):N]
	
		return( list(train = train_indices, test = test_indices) )
	}
}
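
# Illustrative usage of init_values() (commented out; data are made up):
# X = matrix(rnorm(100), 20, 5); Y = rnorm(20)
# splits = init_values(X, Y, sample.size = 0.75)
# # splits$xtrain has 15 rows, splits$xtest has 5 rows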

extractYFromData <- function(XY, whichColForY = ncol(XY) )
{
	if (is.character(whichColForY))
	{	return(list( X = XY[,-which(colnames(XY) == whichColForY)], Y = XY[,whichColForY])) }
	else
	{   return(list( X = XY[,-whichColForY], Y = XY[,whichColForY])) }
}

getOddEven = function(X, div = 2)
{
	even = which( (X%%div) == 0)
	odd = which(  (X%%div) == 1)
	
	if (length(even) > 0)
	{	X.even = X[even]	}
	else
	{	X.even = 0	}
	
	if (length(odd) > 0)
	{	X.odd = X[odd]	}
	else
	{	X.odd = 0	}
	
	return(list(even = X.even, odd = X.odd))
}

min_or_max = function(X)
{
	s =sample (c(0,1),1)
	if (s)
	{ return(min(X))	}
	else
	{ return(max(X))	}

}

# Correlations 
getCorr = function(X, mid = 1/3, high= 2/3)
{
	if ( high <= mid)
	{  mid = 0.5*high	}
	
	zeros =vector()
	for (j in 1:ncol(X))
	{ zeros[j] = length(which(X[,j] == 0))/nrow(X)	}
	
	corr.matrix = cor(X)
	diag(corr.matrix) = 0
		
	high_correl = which( abs(corr.matrix) >= high, arr.ind = TRUE )
	mid_correl = which( (abs(corr.matrix) >= mid) & (abs(corr.matrix) < high), arr.ind = TRUE )
	low_correl = which( abs(corr.matrix) < mid, arr.ind = TRUE )
	
		
	if  (dim(high_correl)[1] > 0)  
	{
		high.correl.values.idx = find.first.idx(corr.matrix[high_correl])
		high.correl.values = corr.matrix[high_correl]
		var_name = cbind(high_correl, round(high.correl.values,4))[high.correl.values.idx ,]
		
		if (is.vector(var_name))
		{ var_name = as.matrix(t(var_name))	}
		
		var_name2 = var_name[match(sort(unique(var_name[,2])), var_name[,2]),]
		
		if (is.vector(var_name2))
		{ var_name2 = as.matrix(t(var_name2)) }
		
		if (!is.null(colnames(X))) 
		{	var_name = cbind(colnames(X)[var_name2[,1]], colnames(X)[var_name2[,2]], var_name2[,3])		}
		else
		{  var_name = var_name2	}
	}
	else
	{	var_name = var_name2 = paste("no correlation > ",high)	} 
		
	
	score = list(corr.matrix = corr.matrix, high.correl = high_correl, mid.correl = mid_correl, low.correl = low_correl, 
		high.correl.idx = var_name2, high.correl.var.names = var_name, zeros = round(zeros,6));
		
	return(score)
}


rm.correlation = function(X, corr.threshold = 2/3, zeros.threshold = 0.45, repeated.var = FALSE, suppress = TRUE)
{
   
    correlation.levels = getCorr(X, high = corr.threshold) 
      
    if (is.character(correlation.levels$high.correl.idx)) 
	{	return("lower correlation for all variables")	}
	else
	{	
		L1 = correlation.levels$zeros[ correlation.levels$high.correl.idx[,1]]
		L2 = correlation.levels$zeros[ correlation.levels$high.correl.idx[,2]]
		
		zeros.idx = which( ((L1+L2)/2 < zeros.threshold) | ((L1+L2) > (1+zeros.threshold)) )
		
		if ( length(zeros.idx) > 0)
		{	
			rm.idx = count.factor(correlation.levels$high.correl.idx[zeros.idx,1])
			
			if (repeated.var)
			{	rm.idx = as.numeric(rm.idx$values[which(rm.idx$numbers > 1)])	}
			else
			{ 	rm.idx = as.numeric(rm.idx$values)	}	
									
			if (is.null(colnames(X)))
			{	var_name = "no colnames"	}
			else
			{ 	var_name = colnames(X)[rm.idx]  }
					
			if (suppress)
			{	X = X[,-rm.idx]  }	
			
			high.corr.matrix = correlation.levels$high.correl.var.names	
		
			object.list = list(X = X, var_name = var_name, corr.var = rm.idx, high.corr.matrix = high.corr.matrix)
		}
		else
		{ 	object.list = "no available true correlation because of zeros. Try increase threshold"	}
		
		return(object.list)		
	}
}

find.root = function(f,a,b,...)
{
	while ((b-a) > 0.0001 )
	{
		c = (a+b)/2
		if ((f(b,...)*f(c,...)) <= 0)
		{	a = c }
		else
		{ b = c }
	}
	return(list(a=a, b=b))
}
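
# Illustrative usage (commented out): bisection for the root of x^2 - 2 on [0, 2]
# find.root(function(x) x^2 - 2, 0, 2)   # $a and $b bracket sqrt(2) within 1e-4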

Id <- function(X) X 

# Treatment of categorical variables
dummy.recode = function(X, which.col, threshold = 0.05, allValues = TRUE)
{
	if (is.numeric(which.col))
	{	names.X = colnames(X)[which.col]	}
	else
	{  names.X = which.col	}
	
	X = X[,which.col]
	Y = NULL
	n = length(X)
	cf.X = count.factor(X)
	cat.values = as.numeric(cf.X$values)
	cat.numbers =  as.numeric(cf.X$numbers)
	
	if (length(cat.numbers) == 1)
	{  X = as.matrix(X); colnames(X)[1] = names.X; return(X) }
	else
	{		
		merge.values.idx = which(cat.numbers <  threshold*n)
		
		if (length(merge.values.idx) > 0)
		{ 	
			cat.values[merge.values.idx]  =  cat.values[merge.values.idx[1]] 
			cat.values = unique(cat.values)
			tmp.cat.numbers =  sum(cat.numbers[merge.values.idx])
			cat.numbers[merge.values.idx] = 1
			cat.numbers = unique(cat.numbers)
			cat.numbers[which(cat.numbers == 1)] = tmp.cat.numbers
		}
				
		dummy.l = length(cat.values) + (allValues - 1)
		tmp.X = vector(length = length(X))
		tmp.names = names.X 
		
		if (dummy.l == 0)
		{	X = rep(1, length(X));  X = as.matrix(X); colnames(X)[1] = names.X; return(X)  }
		else
		{
			for (i in 1:(dummy.l))	
			{
				idx = which(X == cat.values[i] )
				if (length(merge.values.idx) > 0)
				{
					if (cat.values[i] == merge.values.idx[1])
					{ 
						for (j in 2:length(merge.values.idx))
						{	idx = c(idx, which(X == merge.values.idx[j]))	}
					}		
				}
				tmp.X[idx] =  1
				tmp.X[-idx] = 0		
				Y = cbind(Y,tmp.X)	
				tmp.names[i] = paste(names.X,".", cat.values[i],sep = "")
			}	

			colnames(Y) = tmp.names
			return(Y)
		}		
	}
}
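
# Illustrative usage of dummy.recode() (commented out; a made-up categorical column):
# X = data.frame(cat = c(1, 1, 2, 2, 3))
# dummy.recode(X, "cat")   # returns 0/1 columns "cat.1", "cat.2", "cat.3"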

inDummies <- function(X, dummies, inDummiesThreshold = 0.05, inDummiesAllValues = TRUE )
{
	X.dummies = NULL
	for (j in 1:length(dummies)) {	X.dummies = cbind(X.dummies, dummy.recode(X, dummies[j], threshold = inDummiesThreshold, allValues = inDummiesAllValues)) }
	
	if (is.numeric(dummies[1])) {  return(cbind(X[,-dummies], X.dummies)) }
	else {   return(cbind(X[,-match(dummies, colnames(X))], X.dummies))	}
}

rmNoise =  function(X, catVariables = NULL, threshold = 0.5, inGame = FALSE, columnOnly = TRUE)
{
	n = nrow(X)
	if (is.null(catVariables)) {   catVariables = 1:ncol(X)	}
	
	if (inGame)
	{
		for (j in catVariables)
		{
			idxInf = which( table(X[,j])/n < threshold )
			if (length(idxInf) > 0)
			{ 
				for (i in 1:length(idxInf))
				{	
					rmIdx = which(X[,j] == as.numeric(names(idxInf[i])))
					if (length(rmIdx) > 0)	{ 	X = X[-rmIdx,]	}
				}
			}			
			n = nrow(X)
		}
	}
	else
	{
		rmIdxAll = NULL
		for (j in catVariables)
		{
			idxInf = which( table(X[,j])/n > threshold )
			if (length(idxInf) > 0)
			{ 
				if (columnOnly)
				{	rmIdxAll = c(rmIdxAll, j)	  }
				else
				{
				
					for (i in 1:length(idxInf))
					{	
						rmIdx = which(X[,j] == as.numeric(names(idxInf[i])))
						if (length(rmIdx) > 0)	{ 	rmIdxAll = c(rmIdxAll, rmIdx)	}
					}
				}
			}			
		}
		
		if (!is.null(rmIdxAll))	
		{ 
			rmIdxAll = unique(rmIdxAll)
			if (columnOnly)	{  X = X[,-rmIdxAll] }
			else {	X = X[-rmIdxAll,]	}
		}
	}
	
	return(X)
}


# category2Quantile <- function(X, nQuantile = 20)
# {
	# Z = X
	
	# for (j in 1:ncol(X))
	# {
		# limQ = min(X[,j])
		# for (i in 1:nQuantile)
		# {	
			# Q = quantile(X[,j], i/nQuantile)
			# Z[which(Z[,j] <= Q & Z[,j] >= limQ), j] = i-1
			# limQ = Q	
		# }
	
	# }
	# return(Z)
# }

# category2Proba <- function(X,Y, whichColumns = NULL, regression = FALSE, threads = "auto")
# {
	# if (is.null(whichColumns)) {  whichColumns = 1:ncol(X) }
	# p = length(whichColumns)
	# n = nrow(X)
	# ghostIdx = 1:n
	# Z = X
	
	# for (j in  whichColumns)
	# {
		# XX = cbind(X[,j],ghostIdx)
		# NAIdx = which.is.na(X[,j])
		# if (length(NAIdx) > 0)
		# {	
			# catValues = rmNA(unique(X[-NAIdx,j])) 				
			# for (i in seq_along(catValues))
			# {
				# idx = which(X[-NAIdx,j] == catValues[i])
				# trueIdx = XX[-NAIdx,j][idx]
				# if (!regression)
				# {	Z[trueIdx, j] = max(table(Y[trueIdx]))/length(idx)	}
				# else
				# {	Z[trueIdx, j] = sum(Y[trueIdx])/length(idx) }
			# }
		# }
		# else
		# {
			# catValues = unique(X[,j])
			# for (i in seq_along(catValues))
			# {
				# idx = which(X[,j] == catValues[i])
				# if (!regression)
				# { 	Z[idx, j]= max(table(Y[idx]))/length(idx)	} #Z[idx ,j]
				# else
				# {	Z[idx,j] = sum(Y[idx])/length(idx) }
			# }
		# }
	# }
	# return(Z)
# }
		
# imputeCategoryForTestData <- function(Xtrain.imputed, Xtrain.original, Xtest, impute = TRUE)
# {
	# Xtest.imputed = matrix(NA, ncol = ncol(Xtrain.imputed), nrow = nrow(Xtest))
	# for (j in 1:ncol(Xtrain.original))
	# {
		# uniqueCat = unique(Xtrain.original[,j])
		# for( i in 1:length(uniqueCat) )
		# {
			# uniqueIdx = which(Xtest[,j] == uniqueCat[i])
			
			# if (length(uniqueIdx) > 0)
			# {
				# uniqueIdx2 = which(Xtrain.original[,j] == uniqueCat[i])[1]
				# Xtest.imputed[uniqueIdx,j] = Xtrain.imputed[uniqueIdx2,j]
			# }
		# }
	# }
	
	# if (impute)
	# {	
		# if (length(which.is.na(Xtest.imputed)) > 0)
		# {	Xtest.imputed = na.impute(Xtest.imputed) }
	# }
		
	# return(Xtest.imputed)
# }
			
# categoryCombination <- function(X, Y, idxBegin = 1, idxtoRemove = NULL)
# {
	# seqCol = (1:ncol(X))[-c(idxBegin, idxtoRemove)]
	# Z = matrix(data = 0, ncol = length(seqCol), nrow = nrow(X))
	# l = 1
	# for (j in seqCol)
	# {
		# matchSameCat = which(X[,idxBegin] == X[,j])
		# if (length(matchSameCat) > 0)
		# {
			# Z[matchSameCat, l] = max(table(rmNA(Y[matchSameCat])))/length(matchSameCat)
			# l = l + 1
		# }
	# }
	
	# return(Z)
# }
		
permuteCatValues = function(X, catVariables)
{
	if (is.vector(X))
	{ X = as.matrix(X); catVariables = 1 }

	XX = X 
		
	for (j  in catVariables)
	{
		tmpX = Z = X[,j]
		XTable  = count.factor(Z)
		XCat = as.numeric(XTable$values)
		XNb = XTable$numbers
		
		nRepCat = length(XCat)
		randomCat = sample(XCat, nRepCat)
		
		for (i in 1:nRepCat)
		{
			if (XNb[i] > 0)
			{
				oldIdx = which(Z == XCat[i])
				tmpX[oldIdx] = randomCat[i]
			}
		}
		XX[,j] = tmpX
	}
	
	return(XX)
}

generic.log <- function(X, all.log = FALSE)
{
	if (all.log)
	{
		for (j in 1:ncol(X)) { 	X[,j] = log(X[,j] + abs(min(X[,j])) +1) }
	}
	else
	{
		for (j in 1:ncol(X)) { 	X[,j] = if (min(X[,j]) >= 0) { log(X[,j] + 1) } else { X[,j] } }
	}
	return(X)
}

smoothing.log = function(X, filtering = median, k = 1)
{
	zeros = which(X == 0)
	if (length(zeros) > 0)
	{
		min.X = filtering(X[-zeros])
		replace.zeros = runif(length(zeros),0, k*min.X)
		X[zeros] = replace.zeros 
	}
	
	return(X)
	
}

generic.smoothing.log = function(X, threshold = 0.05, filtering = median ,  k = 1)
{
	if (is.vector(X) )
	{	n  = length(X);  p = 1; X =	as.matrix(X)	}
	else
	{ 	p = ncol(X); n = nrow(X) }
	
	Z = X
	
	for (j in 1:p)
	{ 
		min_X = min(X[,j])
		if ( (max(X[,j]) > 0) && ( min_X < 0) )
		{	
			neg.idx = which(X[,j] < 0)
			pos.idx = which(X[,j] > 0)
			
			if (length(neg.idx) <= threshold*n)
			{	X[neg.idx,j] =	0;  Z[,j] = log(smoothing.log(abs(X[,j]), filtering = median ,  k = 1)) 	}
		
			if (length(pos.idx) <= threshold*n)
			{	X[pos.idx,j] =	0;  Z[,j] = log(smoothing.log(abs(X[,j]), filtering = median ,  k = 1)) 	}
		}
		else
		{	
			
			if ( (min(X[,j]) != 0) || (max(X[,j]) != 0) )
			{			
				if ( (min(X[,j]) == 0) || (max(X[,j]) == 0) )
				{	Z[,j] = log(smoothing.log(abs(X[,j]), filtering = median,  k = 1))	}
			}
		}
	}
	
	return(Z)
}


confusion.matrix = function(predY, Y)
{
	if ( (is.vector(predY) && !is.vector(Y)) || (!is.vector(predY) && is.vector(Y)) ) 
	stop("Predictions and responses do not have the same class.")
	if (length(predY) != length(Y)) stop("Predictions and responses do not have the same length.")
	if (inherits(Y, "factor") || inherits(predY, "factor"))  
	{  
		trueClasses = levels(as.factor(Y))
		predY = as.numeric(predY);  Y = as.numeric(Y)
		classes = 1:length(trueClasses)
	}
	else
	{
		trueClasses = 1:length(tabulate(Y))
		predClasses = 1:length(tabulate(predY))
		if (length(predClasses) > length(trueClasses)) 	{  trueClasses = predClasses }
		classes = trueClasses
	}
	
	n = length(Y)
	k = length(classes)
	TP = vector(length = length(classes))
		
	if (k == 2)
	{ 	FP = vector(length = length(classes))	}
	else
	{ 	FP = matrix( data = 0, ncol = k, nrow = k ) } 
	
	for (i in 1:k)
	{
		TP[i] = length(which((predY == classes[i]) & (Y == classes[i])))
	   
	    if (k > 2)
	    {
			out.classes = which(classes != i)
			FP[i,i] = TP[i]
			for (j in 1:(k-1))
			{	  FP[i,out.classes[j]] =  length(which((predY == classes[i]) & (Y == classes[out.classes[j]])))	}
		}
		else
		{	FP[i] = length(which((predY == classes[i]) & (Y !=classes[i])))		}
		
	}
		
	if (k == 2)
	{	ConfMat = matrix(data = cbind(TP, FP) , nrow = k, ncol = k); ConfMat[2,] = t(c(FP[2],TP[2])) 		}
	else
	{	ConfMat = FP	}
	
	rownames(ConfMat) = trueClasses
	colnames(ConfMat) = trueClasses
	class.error = round(1- diag(ConfMat)/rowSums(ConfMat),4)
	
	ConfMat = cbind(ConfMat,class.error)
	names(dimnames(ConfMat)) = c("Prediction", "Reference")

	return(ConfMat)
}
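
# Illustrative usage of confusion.matrix() (commented out; labels are made up):
# pred = factor(c("a", "a", "b", "b")); truth = factor(c("a", "b", "b", "b"))
# confusion.matrix(pred, truth)
# # rows = predictions, columns = reference, plus a 'class.error' column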


generalization.error = function(conf.matrix) 1 - sum(diag(conf.matrix[,-ncol((conf.matrix))]))/sum(conf.matrix[,-ncol((conf.matrix))])

overSampling = function(Y, which.class = 1, proportion = 0.5)
{
	n = length(Y)
	S1 <- find.idx(which.class, Y, sorting = FALSE)
	if (proportion == -1)
	{	S1.newSize = length(Y[-S1])	}
	else
	{	S1.newSize = floor((1 + proportion)*length(S1)) }
	
	S1.sample = sample(S1, S1.newSize, replace = ifelse(length(S1) > S1.newSize, FALSE, TRUE) ) 
	S2 = (1:n)[-S1]
	if (proportion == -1)
	{	S2.sample = S2 }
	else
	{ 	S2.sample = sample(S2, length(S2) + length(S1) - S1.newSize, replace = TRUE)  }#
			
	return( list(C1 = sort(S1.sample), C2 = sort(S2.sample)) )
}

outputPerturbationSampling = function(Y, whichClass = 1, sampleSize = 1/4, regression = FALSE)
{
	n = length(Y)
	
	if (regression)
	{
		randomIdx =  sample(1:n, min(n, floor(sampleSize*n) + 1), replace = TRUE)
		meanY <- sum(Y[randomIdx])/length(Y[randomIdx])
		minY <- min(Y)
		maxY <- max(Y)
		
		sdY <- if (sum(abs(Y[randomIdx])) == 0) { 0.01 } else { max(0.01, sd(Y[randomIdx])) }
			
			
		randomData = rnorm(length(randomIdx), meanY, 3*sdY)
		if ( (minY < 0) && (maxY > 0))  Y[randomIdx] = randomData
				
		if ( (minY >= 0) && (maxY > 0)) Y[randomIdx] = abs(randomData) 
		
		if ( (minY < 0 ) && (maxY <= 0)) Y[randomIdx] = -abs(randomData) 
	}
	else
	{	
		if (whichClass == -1)	{ 	whichClass = 1	}
		
		idxClass = which(Y == whichClass)
		
		perturbClassidx = sample(idxClass, floor(min(sampleSize,0.5)*length(idxClass)))
		perturbClassY  = sample(Y[which(Y !=  whichClass)], length(perturbClassidx), replace = TRUE)
		Y[perturbClassidx] = perturbClassY
	}
	
	return(Y)
}


keep.index <- function(X,idx) if (!is.null(dim(X))) { (1:nrow(X))[idx] } else  { (1:length(X))[idx] }

roc.curve <- function(X, Y, classes, positive = classes[2], ranking.threshold = 0, ranking.values = 0, falseDiscoveryRate = FALSE, plotting = TRUE,  printValues = TRUE, Beta = 1)
{
	par(bg = "white")
	classes = sort(classes)
	
	if (length(classes) != 2)
	{ stop("Please provide two classes for plotting ROC curve.") }
	
	if (is.vector(X) && (!is.numeric(X)) && !(is.list(X)) )
	{	X <- vector2factor(X) }
	
	if (is.factor(X))
	{ 	X <- factor2vector(X)$vector	}
	
	if (is.list(X))
	{
		if (inherits(X,"randomUniformForest")) # class(X) == "randomUniformForest"
		{	
			if (!is.null(X$predictionObject))
			{	X <- X$predictionObject	}
			else
			{   X <- X$forest }
		}		
	}
		
	YObject = factor2vector(vector2factor(Y))
	
	Y = YObject$vector
	newClasses = rep(0,2)
	newClasses[1] = find.idx(classes[1], YObject$factors)
	newClasses[2] = find.idx(classes[2], YObject$factors)
	positive = newClasses[2]
	classes = newClasses	
	
	if (falseDiscoveryRate)
	{		
		if (is.list(X))
		{
			class.object = if (!is.null(X$all.votes)) { majorityClass(X$all.votes, classes) } 
			else { majorityClass(X$OOB.votes, classes) }  
			pred.classes = if (!is.null(X$majority.vote)) { X$majority.vote } else { X$OOB.predicts }
			class.probas = (class.object$class.counts/rowSums(class.object$class.counts))[,1]
		}
		else
		{  
			class.probas = 0.5; 
			pred.classes = X 
			
			if (length(X) != length(Y))
			{ stop("X is not a vector, or length of X and Y are not the same.\n") }
		}
	}
	else
	{ 	
		pred.classes = if (is.numeric(X)) { X } 
		else 
		{  
			if (!is.null(X$all.votes))	{ X$majority.vote }
			else  {	X$OOB.predicts }	
		}
		class.probas = ranking.values;   
		ranking.threshold = 0 
	}
		
	pos.idx = which(pred.classes == positive & class.probas >=  ranking.threshold )
	n_pos.idx = length(pos.idx)
	
	if (n_pos.idx == 0)
	{
		cat("\nNo positive case found.\n")
		return(list(cost = 0, threshold = ranking.threshold, accuracy = 0, expected.matches = 0))
	}
	else
	{
		conf.mat = confusion.matrix(pred.classes,Y)
		n2 = diag(conf.mat[,-ncol(conf.mat)])[nrow(conf.mat)] # true positives
		n1 = length(pos.idx) - n2		# false positives
		n3 = conf.mat[1,2]	# false negatives
		n4 = conf.mat[1,1]   # true negatives
		
		inv_n23 = 1/(n2 + n3)
		inv_n12 = 1/(n1 + n2)
		if (falseDiscoveryRate)
		{
			VP = array(0, n_pos.idx+1)
			FP = array(1, n_pos.idx+1)

			for(i in 1:n_pos.idx)
			{
				if (pred.classes[pos.idx[i]] == Y[pos.idx[i]])	{ VP[i+1] = VP[i] + inv_n23; FP[i+1] = FP[i] }
				else  {	FP[i+1] = FP[i] - inv_n12;  VP[i+1] = VP[i] }
			}
			FP.new = c(FP,0)
			VP.new = c(VP,1)
			
			n_pos = length(VP.new)
			aupr = rep(0,n_pos)
			for (j in 2:n_pos)
			{	
				DV = (VP.new[j] - VP.new[j-1])
				DF = (FP.new[j-1] - FP.new[j] )
				#DF = (1-FP.new[j] - (1-FP.new[j-1]))
				aupr[j] = { (1 -  VP.new[j])*DF + 0.5*DV*DF }
			}
			aupr = 1- sum(aupr)
		}
		else
		{
			# for ROC curve : VP rate = Sensitivity = VP/(VP + FN) ;  specificity = VN/(VN + FP) = 1 - FP rate 
			# and Sensitivity = True positives among all; Specificity = True negatives among all
			VP = FP = array(0, n_pos.idx+1)
			inv_n14 = 1/(n1 + n4)
			for(i in 1:n_pos.idx)
			{
				if(pred.classes[pos.idx[i]] == Y[pos.idx[i]] )
				{	VP[i+1] = VP[i] + inv_n23; FP[i+1] = FP[i]	}
				else
				{	FP[i+1] = FP[i] + inv_n14;  VP[i+1] = VP[i] }
			}
			FP.new = c(FP,1)
			VP.new = c(VP,1)
			AUC.est = round(pROC::auc(Y, pred.classes ),4)
		}
			
		if (plotting)
		{
			if (ranking.threshold == 0)
			{	
				F1.est = fScore(conf.mat, Beta = Beta)
				
				if(!falseDiscoveryRate)
				{
					plot(FP.new, VP.new, xlab = "False Positive rate [1 - Specificity]", ylab = "Sensitivity [True Positive rate]", 
					type='l', col = sample(1:10,1), lwd = 2)
					title(main = "ROC curve", sub = paste("AUC: ",  AUC.est, ". Sensitivity: ", round(100*VP.new[length(pos.idx)+1],2), "%",
					". False Positive rate: ", round(100*FP.new[length(pos.idx)+1],2), "%", sep=""), col.sub = "blue", cex.sub = 0.85)
					points(c(0,1), c(0,1), type='l', col='grey')
					abline(v = FP.new[length(pos.idx)+1], col='purple', lty = 3)
					abline(h = VP.new[length(pos.idx)+1],col='purple', lty = 3)
					if (printValues) { cat("AUC: ",  AUC.est, "\n") }
				}
				else
				{
					plot(VP.new, FP.new, xlab = "Sensitivity [True Positive rate]", ylab = "Precision [1 - False Discovery rate ]", 
					type='l', col = sample(1:10,1), lwd = 2)
					title(main = "Precision-Recall curve", sub = paste("\nProbability of positive class", " > ", 0.5, ". Precision: ", 
					round(100*FP.new[length(pos.idx)+1],2), "%", ". Sensitivity: ", round(100*VP.new[length(pos.idx)+1],2), "%",  ". F", Beta, "-Score: ", round(F1.est,4), ". AUPR: ", round(aupr, 4), sep = ""), col.sub = "blue", cex.sub = 0.85)
					points(c(0,1), c(1,0), type='l', col='grey')
					abline(v = VP.new[length(pos.idx)+1], col='purple', lty = 3)
					abline(h = FP.new[length(pos.idx)+1], col='purple', lty = 3)
					if (printValues) 
					{
						cat("AUPR (Area Under the Precision-Recall curve) :", round(aupr,4), "\n")
						cat("F1 score: ", round(F1.est,4) , "\n")
					}
				}
				#grid()
				if (printValues) { print(conf.mat) }
			}
			else
			{	
				pred.classes[-pos.idx] = classes[-which(classes == positive)]
				conf.mat = confusion.matrix(pred.classes,Y)
				F1.est = fScore(conf.mat)
				if (printValues) 
				{
					cat("F1 ", F1.est, "\n")
					print(conf.mat)
				}
				plot(VP.new, FP.new, xlab = "Sensitivity [1 - True positives Rate]", ylab = "Precision [1 - False Discovery Rate]", main = paste("Precision-recall curve ", "[Probability of positive class", " > ", ranking.threshold, "]", sep=""), type='l', col = sample(1:10,1), lwd = 2)
				points(c(0,1), c(1,0), type='l', col='grey')
				abline(v = VP.new[length(pos.idx)+1], col='purple', lty = 3)
				abline(h = FP.new[length(pos.idx)+1], col='purple', lty = 3)
				#grid()
			}
		}
		else
		{
			F1.est = fScore(conf.mat, Beta = Beta)
			AUCValue = if (falseDiscoveryRate) { aupr } else { AUC.est }
			return(list(cost = round((1-(n1+n2)/sum(pred.classes == positive))*100,2), threshold =  ranking.threshold, 
				accuracy = round(( 1 - (1 - F1.est)/conf.mat[nrow(conf.mat), ncol(conf.mat)])*100,2), 
				expected.matches = round(((n1 + n2)/sum(Y == positive))*100,2), AUC = AUCValue))
		}
	}
}


myAUC <- function(pred.classes, Y, positive = 2, falseDiscoveryRate = FALSE)
{	
	pos.idx = which(pred.classes == positive)
	n_pos.idx = length(pos.idx)
	if(n_pos.idx == 0)
	{	
		cat("\nNo positive case found.\n")
		return(list(auc = 0, VP = 0, FP = 0))
	}
	else
	{
		conf.mat = confusion.matrix(pred.classes,Y)
		
		n2 = diag(conf.mat[,-ncol(conf.mat)])[nrow(conf.mat)] # true positives
		n1 = n_pos.idx - n2	# false positives
		n3 = conf.mat[1,2]	# false negatives
		n4 = conf.mat[1,1]  # true negatives
		
		inv_n23 = 1/(n2 + n3)
		inv_n12 = 1/(n1 + n2)
		#compute Area under Precision-Recall Curve
		if (falseDiscoveryRate) 
		{
			VP = array(0, n_pos.idx+1)
			FP = array(1, n_pos.idx+1)
			for(i in 1:n_pos.idx)
			{
				if (pred.classes[pos.idx[i]] == Y[pos.idx[i]] )  { VP[i+1] = VP[i] + inv_n23; FP[i+1] = FP[i]	}
				else {	FP[i+1] = FP[i] - inv_n12;  VP[i+1] = VP[i] }
			}
			FP.new = c(FP,0)
			VP.new = c(VP,1)
			
			n_pos = length(VP.new)
			auc = rep(0,n_pos)
			for (j in 2:n_pos)
			{	
				DV = (VP.new[j] - VP.new[j-1])
				DF = (FP.new[j-1] - FP.new[j] )
				#DF = (1-FP.new[j] - (1-FP.new[j-1]))
				auc[j] = { (1 -  VP.new[j])*DF + 0.5*DV*DF }
			}
			auc = 1- sum(auc)
		}
		# compute Area under ROC Curve
		else
		{
			# for ROC curve : VP rate = Sensitivity = VP/(VP + FN);  specificity = VN/(VN + FP) = 1 - FP rate 
			# and Sensitivity = True positives among all; Specificity = True negatives among all
			# rOC plots True positive rate (y-axis) vs false positive rate (x-axis)
			VP = FP = array(0, length(pos.idx)+1)
			inv_n14 = 1/(n1 + n4)
			for (i in 1:length(pos.idx))
			{
				if (pred.classes[pos.idx[i]] == Y[pos.idx[i]] ) {	VP[i+1] = VP[i] + inv_n23; FP[i+1] = FP[i]	}
				else {	FP[i+1] = FP[i] + inv_n14; VP[i+1] = VP[i] }
			}
			FP.new = c(FP,1)
			VP.new = c(VP,1)
			
			n_pos = length(VP.new)
			auc = rep(0,n_pos)
			for (j in 2:n_pos)
			{	
				DV = (VP.new[j] - VP.new[j-1])
				DF = (FP.new[j] - FP.new[j-1])
				auc[j] = (1 - VP.new[j])*DF + 0.5*DV*DF
			}
			auc = 1- sum(auc)
		}
		
		return(list(auc = auc, VP = VP.new, FP = FP.new))
	}
}


optimizeFalsePositives = function(X, Y, classes, o.positive = classes[2], o.ranking.values = 0, o.falseDiscoveryRate = TRUE, stepping = 0.01)
{
	if (o.falseDiscoveryRate)
	{ 	threshold = seq(0.51, 0.9, by = stepping)	}
	else
	{	 threshold = seq( median(o.ranking.values), max(o.ranking.values), by = 0.01)	}
	
	tmp.AUC = 0.01
	cost = accuracy = expected.matches = AUC = vector()
	
	i = 1; n = length(threshold)
	while( (tmp.AUC < 1) && (tmp.AUC > 0) && (i <= n))
	{
		ROC.object = roc.curve(X, Y, classes, positive = o.positive, ranking.threshold = threshold[i], ranking.values = o.ranking.values, falseDiscoveryRate = o.falseDiscoveryRate, plotting = FALSE)
		
		cost[i] = ROC.object$cost 
		accuracy[i] = ROC.object$accuracy 
		expected.matches[i] = ROC.object$expected.matches
		AUC[i] = ROC.object$AUC
		tmp.AUC = AUC[i]
		i = i + 1
	}
	threshold = threshold[1:(i-1)]
	
	plot(threshold, cost, type='l')
	points(threshold, accuracy, col='red', type='l')
	points(threshold, AUC*100, col='green', type='l')
	points(threshold, expected.matches, col='blue', type='l')
	grid()

	return(list(cost = cost, threshold = threshold,  accuracy = accuracy , expected.matches = expected.matches,  AUC = AUC))
}

someErrorType <- function(modelPrediction, Y, regression = FALSE)
{
	if (regression)
	{  
		n = length(Y)
		MSE <- L2Dist(Y, modelPrediction)/n
		return(
			list(
				error = MSE, 
				meanAbsResiduals = sum(abs(modelPrediction - Y))/n, 
				Residuals = summary(modelPrediction - Y), 
				percent.varExplained = max(0, 100*round(1 - MSE/var(Y),4)) 
			) 
		)
	}
	else
	{  
		confusion = confusion.matrix(modelPrediction,Y)
		if (length(unique(Y)) == 2 )
		{	
			return(list( error = generalization.error(confusion), confusion = confusion, 
			AUC = pROC::auc(Y, as.numeric(modelPrediction)), AUPR = myAUC(as.numeric(modelPrediction), as.numeric(Y), falseDiscoveryRate = TRUE)$auc) )
		}
		else
		{	return(list( error = generalization.error(confusion), confusion = confusion))	}
	}	
}

gMean <- function(confusionMatrix, precision = FALSE)  
{
	p = ncol(confusionMatrix)
	if (precision) 	{  acc = rmNA(diag(confusionMatrix[,-p])/rowSums(confusionMatrix[,-p])) }	
	else { 	acc = rmNA(diag(confusionMatrix[,-p])/colSums(confusionMatrix[,-p]))  }
	idx = which(acc == 0)
	if (length(idx) > 0) { acc = acc[-idx]	}
	nbAcc = length(acc)
	return( (prod(acc))^(1/nbAcc))
}

fScore <- function(confusionMatrix, Beta = 1) 
{
	confusionMatrix = confusionMatrix[,-ncol(confusionMatrix)]
	precision = confusionMatrix[2,2]/(confusionMatrix[2,1] +  confusionMatrix[2,2])
	recall = confusionMatrix[2,2]/(confusionMatrix[1,2] +  confusionMatrix[2,2])
	
	return( (1 + Beta^2)*(precision*recall)/(Beta^2*precision + recall) )
}
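
# Illustrative worked example for fScore() (commented out), using the layout returned
# by confusion.matrix() (rows = predictions, columns = reference, last column = class error):
# cm = cbind(matrix(c(50, 10, 5, 35), 2, 2), c(0.09, 0.22))
# # precision = 35/(10 + 35), recall = 35/(5 + 35)
# fScore(cm)   # about 0.8235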

kappaStat <- function(confusionMatrix)
{
	n = sum(confusionMatrix[,-3])
	p1.all = sum(confusionMatrix[1,])/n; pall.1 = sum(confusionMatrix[,1])/n;
	p2.all = sum(confusionMatrix[2,])/n; pall.2 = sum(confusionMatrix[,2])/n; 
	Pr_a = sum(diag(confusionMatrix[,-3]))/n
	Pr_e = p1.all*pall.1 + p2.all*pall.2
	return((Pr_a - Pr_e )/(1 - Pr_e))
}

expectedSquaredBias <- function(Y, Ypred)  (mean(Y - mean(Ypred)))^2

randomWhichMax <- function(X)
{
    set.seed(sample(1e9,1))
	Y = seq_along(X)[X == max(X)]
    if (length(Y) == 1) { Y } else  { sample(Y,1) } 
}
	
# import: gtools (hence the binsearch function) is no longer available for one Linux distribution.
estimaterequiredSampleSize <- function(accuracy, dOfAcc) -log( dOfAcc/2)/(2*(1- accuracy)^2)  
estimatePredictionAccuracy <- function(n,  dOfAcc = 0.01)
{
	virtualRoot <- function(accuracy, n = n, dOfAcc = dOfAcc)  n - estimaterequiredSampleSize(accuracy, dOfAcc)
	accuracy <- mean(unlist(find.root(virtualRoot, 0.5, 1, n = n, dOfAcc = dOfAcc)))
	
	return(accuracy)
}
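
# Illustrative usage (commented out):
# estimatePredictionAccuracy(1000)   # about 0.95 for n = 1000 at the default dOfAcc = 0.01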

# subEstimaterequiredSampleSize <- function(accuracy, n) pbinom( floor(n*0.5)-floor(n*(1- accuracy)), size = floor(n), prob=0.5) 
# binomialrequiredSampleSize <- function(accuracy, dOfAcc) 
#{ 
  ##require(gtools)
	
  # r <- c(1,2*estimaterequiredSampleSize(accuracy, dOfAcc))
  # v <- binsearch( function(n) { subEstimaterequiredSampleSize(accuracy, n) - dOfAcc }, range = r, lower = min(r), upper = max(r))

  # return(v$where[[length(v$where)]])
# }
# End of import

setManyDatasets <- function(X, n.times, dimension = FALSE, replace = FALSE, subsample = FALSE, randomCut = FALSE)
{
	X.list = vector("list", n.times)
	if (dimension > 0)
	{  
		p = ncol(X)
		minDim = sqrt(p)
		if (dimension != 1) {  toss = dimension }
		for (i in 1:(n.times))
		{	
			if (dimension == 1) { toss = sample(minDim:p,1)/p }
			X.list[[i]] = sort(init_values(1:p, sample.size = toss, data.splitting = "train")$xtrain) 
		}
	}
	else
	{
		n = nrow(X)
		if (randomCut)	{ 	idx = sample(1:n, n) }
		else { idx = 1:n }
		cuts = cut(idx, n.times, labels = FALSE)
		n.times = 1:n.times			
		if (replace && !subsample) 
		{ 
			idx = sort(sample(idx, n, replace = replace))
			for (i in seq_along(n.times))
			{	X.list[[i]] = idx[which(cuts == i)] }
		}
		else
		{
			if (subsample != 0) 
			{ 
				idx = sort(sample(idx, floor(subsample*n), replace = replace))
				for (i in seq_along(n.times))
				{	X.list[[i]] = rmNA(idx[which(cuts == i)]) }
			}
			else
			{
				for (i in seq_along(n.times))
				{	X.list[[i]] = which(cuts == i) }
			}
		}		
	}
	
	return(X.list)
}

# simulation of a random gaussian matrix whose parameters come from a uniform distribution on [-10,10]
simulationData <- function(n, p, distrib = rnorm, colinearity = FALSE)
{
	X = matrix(NA, n, p)
	for (j in 1:p)
	{
		params = runif(1, -10, 10)
		X[,j] = distrib(n, params, sqrt(abs(params)))
	}
	X <- fillVariablesNames(X)
	
	return(X)
}
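
# Illustrative usage (commented out):
# X = simulationData(100, 5)   # 100 x 5 gaussian matrix, columns named V1 ... V5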

difflog <- function(X) 	diff(log(X))

rollApplyFunction <- function(Y, width, by = NULL, FUN = NULL)   
{  	
    if (is.null(by)) { by = width }	
		
	n = length(Y) 
	T = min(trunc(n/width),n)
	Z = NULL
	j = 1
	
	for(i in 1:T) 
	{	
		ZTmp = Y[j:(j + width - 1)]
		idxZ = seq(1, width, by = by)
		Z = c(Z,FUN(ZTmp[idxZ]))
		j = j + width 
	} 
	
	return( na.omit(Z))
}

lagFunction <- function(X, lag = 1, FUN = NULL, inRange= FALSE)
{
	if (lag == 1) { return(FUN(X))	}
	else
	{
		idx = 1:length(X)
		lagIdx = idx + lag
       
		limitIdx = which(lagIdx == max(idx))
	   
		XIdx = cbind(lagIdx[1:limitIdx],idx[1:limitIdx])
	   
		if (inRange) {  return(apply(cbind(XIdx[,2], XIdx[,1]),1, function(Z)  FUN(X[Z[1]:Z[2]]) )) }		
	   else { return(apply(cbind(X[XIdx[,2]], X[XIdx[,1]]),1, function(Z)  FUN(Z) ))     } 
	   
	}
}

L2Dist <- function(Y,Y_)  sum( (Y - Y_)*(Y - Y_) )

L1Dist <- function(Y,Y_) sum(abs(Y - Y_))

L2.logDist <- function(Y,Y_) sum( (log(Y) - log(Y_))^2 )

HuberDist <- function(Y, Y_, a = abs(median(Y - Y_)))  sum(ifelse( abs(Y - Y_) <= a, (Y - Y_)^2, 2*a*abs(Y - Y_)- a^2))

pseudoHuberDist <- function(Y, Y_, a = abs(median(Y - Y_))) sum(a^2*(sqrt(1 + ((Y - Y_)/a)^2) - 1))
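
# Illustrative comparison of the loss functions (commented out; values are made up):
# y = c(1, 2, 3); y_ = c(1.5, 2, 2)
# L2Dist(y, y_)   # 0.25 + 0 + 1 = 1.25
# L1Dist(y, y_)   # 0.5 + 0 + 1 = 1.5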

# ndcg <- function(estimatedRelevanceScoresNames, trueRelevanceScoresNames, predictionsMatrix, idx = 1:nrow(predictionsMatrix)) 
# {
	# scores = sortMatrix(predictionsMatrix[idx,], trueRelevanceScoresNames, decrease = TRUE)
	# estimatedRelevanceRanks = sortMatrix(predictionsMatrix[idx,], "rank")
	# trueRelevanceScores = scores[,trueRelevanceScoresNames]
	# scoreIdx = match(scores[,estimatedRelevanceScoresNames], estimatedRelevanceRanks[,estimatedRelevanceScoresNames]) 
	# estimatedRelevanceScores = estimatedRelevanceRanks[ scoreIdx,"majority vote"]
	# zeros = which(trueRelevanceScores == 0)
	# if (length(zeros) > 0)
	# { estimatedRelevanceScores[zeros] = 0 }
	# estimatedRelevanceScores = sortMatrix(cbind(scoreIdx, estimatedRelevanceScores),1)[,2]
  
	# limitOverEstimation = which(trueRelevanceScores > 0)
	# if (length(limitOverEstimation) > 0)
	# {	
		# estimatedRelevanceScores[scoreIdx[limitOverEstimation]] = min(estimatedRelevanceScores[scoreIdx[limitOverEstimation]], 
		# trueRelevanceScores[limitOverEstimation])
	# }
     
	# DCG <- function(y) sum( (2^y - 1)/log(1:length(y) + 1, base = 2) ) #y[1] + sum(y[-1]/log(2:length(y), base = 2))
 
	# DCGTrueScore = DCG(trueRelevanceScores)
	# DCGScore = DCG(estimatedRelevanceScores)
	# if (DCGTrueScore == 0)
	# { return (-1)  }
	# else
	# {	 return(DCGScore/DCGTrueScore) }
# }

# fullNDCG <- function(estimatedRelevanceScoresNames, trueRelevanceScoresNames, predictionsMatrix, idx = 1:nrow(predictionsMatrix), filterNoOrder = TRUE)
# {
	# if (is.null(idx))
	# { stop("Please use ndcg() function. fullNDCG needs index of ID to rank scores.") }
	
	# cases = sort(unique(idx))
	# scores = vector(length = length(cases))
	# for (i in 1:length(cases))
	# {
		# subCases = which(idx == cases[i])
		# scores[i] <- ndcg(estimatedRelevanceScoresNames, trueRelevanceScoresNames, predictionsMatrix, idx = subCases)
	# }

	# if (filterNoOrder)
	# { scores[scores == -1] == 1	}
	
	# return(scores)
# }
	
randomize = function(X)  sample(1:nrow(X), nrow(X))

# Bayesian bootstrap
# sampleDirichlet <- function(X)
# {
	# gamma.sample = rgamma(ncol(X),1)
	# dirichlet.sample = gamma.sample/sum(gamma.sample) 
	# R.x =  apply(X, 2,  function(Z) c(max(Z),min(Z)))
	
	# return(-diff(R.x)*dirichlet.sample + R.x[2,])
# }

#some colors
#require(grDevices)
# palet <- colorRampPalette(c("yellow", "red"))

bCICore <- function(X, f = mean, R = 100, conf = 0.95, largeValues = TRUE, skew = TRUE)
{
	XX = X
	bootstrapf = rep(0,R)
	X = unique(X)
	nn = length(X)
	
	replaceWithSeed <- function(f, X, n)
	{
		set.seed(sample(2*R*length(XX),1))
		f(sample(X, n, replace = FALSE))
	}
	
	bootstrapf <- replicate(R, replaceWithSeed(f, XX, nn)) 

	alpha1 = (1 - conf)/2
	alpha2 = conf + alpha1
	if (largeValues) 
	{ 
		if (!skew) {	return(c(mean(bootstrapf), quantile(bootstrapf, alpha1) + qnorm(alpha1)*sd(X)/sqrt(nn), quantile(bootstrapf, alpha2) + qnorm(alpha2)*sd(X)/sqrt(nn) )) }
		else 
		{ 
			return(c(mean(bootstrapf), quantile(bootstrapf, alpha1) - sd(X), quantile(bootstrapf, alpha2)+ sd(X)))	
		}		
	}
	else  { return(c(mean(bootstrapf), quantile(bootstrapf, alpha1), quantile(bootstrapf, alpha2) )) }
}


bCI <- function(X, f = mean, R = 100, conf = 0.95, largeValues = TRUE, skew = TRUE, threads = "auto")
{
	if (is.matrix(X)) { n = nrow(X) } else { n =length(X) }
	
	{
		max_threads = detectCores()
		
		if (threads == "auto")
		{	
			if (max_threads == 2) { threads = max_threads }
			else {	threads  = max(1, max_threads - 1)  }
		}
		else
		{
			if (max_threads < threads) 
			{	cat("Warning : number of threads is higher than logical threads in this computer.\n") }
		}
		
		{
			Cl = makePSOCKcluster(threads, type = "SOCK")
			registerDoParallel(Cl)
		}
		chunkSize  <-  ceiling(n/getDoParWorkers())
		smpopts  <- list(chunkSize = chunkSize)
		
		export = c("bCICore", "rmNA", "rmInf")
	}
	
	
	if (is.vector(X)) 
	{ 
		bciObject = bCICore(X, f = f, R = R, conf = conf, largeValues = largeValues, skew = skew)
		stopCluster(Cl)
		return(bciObject) 
	}
	else
	{
		i = NULL
		if (is.data.frame(X))  { X <- factor2matrix(X) }
		
		if (length(which.is.na(X)) > 0)
		{	
			bciObject <- foreach(i = 1:n, .export = export, .options.smp = smpopts, .combine = cbind, .multicombine = TRUE) %dopar%
			bCICore(rmNA(rmInf(X[i,])), f = f, R = R, conf = conf, largeValues = largeValues, skew = skew)
		}
		else		
		{	
			bciObject <- foreach(i = 1:n, .export = export, .options.smp = smpopts, .combine = cbind, .multicombine = TRUE) %dopar%
			bCICore(rmInf(X[i,]),  f = f, R = R, conf = conf, largeValues = largeValues, skew = skew)
		}
		
		stopCluster(Cl)
		bciObject <- t(bciObject)
		colnames(bciObject)[1] = "Estimate"
		return(bciObject)
	}
}
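
# Usage sketch (hypothetical data; the parallel backend packages used above
# are assumed to be loaded). Each row of X gets its own interval:
# X = matrix(rnorm(5000), 50, 100)
# bCI(X, f = mean, R = 100, conf = 0.95, largeValues = FALSE)
# returns a 50 x 3 matrix: "Estimate" plus the lower and upper quantiles.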


OOBquantiles <- function(object, conf = 0.95, plotting = TRUE, gap = 10, outliersFilter = TRUE, threshold = NULL, thresholdDirection = c("low", "high"))
{
	object = filter.object(object)
	if (is.null(object$forest$OOB.votes)) { stop("no OOB data. Please enable the OOB option when calling the randomUniformForest() function.") }
	
	X = object$forest$OOB.votes
	B =  ncol(X)
	alpha1 = (1 - conf)/2
	alpha2 = conf + alpha1
	
	Q_1 = apply(X, 1, function(Z) quantile(unique(rmInf(Z)), alpha1))
	Q_2 = apply(X, 1, function(Z) quantile(unique(rmInf(Z)), alpha2))
	SD = apply(X, 1, function(Z) sd(rmInf(Z)))
	predictedObject = data.frame(cbind(object$forest$OOB.predicts, Q_1, Q_2, SD))
	Q1Name = paste("LowerBound", "(q = ", round(alpha1,3), ")", sep ="")
	Q2Name = paste("UpperBound", "(q = ", round(alpha2,3), ")",sep ="")
	colnames(predictedObject) = c("Estimate", Q1Name, Q2Name, "standard deviation")
	
	OOsampleUpper = sum(predictedObject[,3] < object$y)/length(object$y)
	OOsampleLower = sum(predictedObject[,2] > object$y)/length(object$y)
	
	# plot quantiles
	if (plotting == TRUE)
	{
		#require(ggplot2)
		predictedObject2 =  predictedObject
		Y = object$y
		predictedObject2 =  cbind(Y,predictedObject2)
		
		if (!is.null(threshold))
		{ 
			if (thresholdDirection[1] == "low") { idx = which(Y <= threshold) }
			else { idx = which(Y > threshold) }
			if (length(idx) < 1) { stop("no observations found using this threshold.") }
			predictedObject2 = predictedObject2[idx, ]
			Y = Y[idx]
		}
		
		if (outliersFilter)
		{
			highOutlierIdx =  which(Y > quantile(Y,0.95))
			lowOutlierIdx =  which(Y < quantile(Y,0.05))
			if (length(highOutlierIdx) > 0 || length(lowOutlierIdx) > 0) 
			{	predictedObject2 = predictedObject2[-c(lowOutlierIdx,highOutlierIdx),]	}
		}		
		predictedObject2 = predictedObject2[seq(1, nrow(predictedObject2), by = gap), ]
		colnames(predictedObject2) = c("Y", "Estimate", "LowerBound", "UpperBound", "standard deviation")
		predictedObject2 = sortMatrix(predictedObject2, 1)
		
		Estimate = UpperBound = LowerBound = NULL
		
		tt <- ggplot(predictedObject2, aes(x = Estimate, y = Y))
			
		plot(tt +  geom_point(size = 3, colour = "lightblue") + geom_errorbar(aes(ymax = UpperBound, ymin = LowerBound)) 
			+ labs(title = "", x = "Estimate (with confidence Interval)", y = "True responses (Y)") )
	}
	
	cat("Probability of being over upper bound (", alpha2*100, "%) of the confidence level: ", round(OOsampleUpper,3) , "\n", sep="")
	cat("Probability of being under the lowest bound (", alpha1*100, "%) of the confidence level: ", round(OOsampleLower,3) , "\n", sep ="")
		
	return(predictedObject)
}
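
# Usage sketch (assuming a regression forest grown with OOB evaluation enabled,
# which is the default):
# rUF.model <- randomUniformForest(X, Y)
# OOBquantiles(rUF.model, conf = 0.9, plotting = FALSE)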

outsideConfIntLevels <- function(predictedObject, Y, conf = 0.95)
{
	alpha1 = (1 - conf)/2
	alpha2 = conf + alpha1
	OOsampleUpper = sum(predictedObject[,3] < Y)/length(Y)
	OOsampleLower = sum(predictedObject[,2] > Y)/length(Y)

	cat("Probability of being over upper bound (", alpha2*100, "%) of the confidence level: ", round(OOsampleUpper,3) , "\n", sep="")
	cat("Probability of being under the lowest bound (", alpha1*100, "%) of the confidence level: ", round(OOsampleLower,3) , "\n", sep ="")
}

# map X onto the empirical distribution of Y, using the standardized values of X as probabilities
scale2AnyValues <- function(X, Y)
{
	XX = standardize(X)
	return(quantile(Y, XX))
}

perspWithcol <- function(x, y, z, pal, nbColors,...,xlg = TRUE, ylg = TRUE)
{
	# import from http://rtricks.wordpress.com/tag/persp/
	colnames(z) <- y
	rownames(z) <- x

	nrz <- nrow(z)
	ncz <- ncol(z) 

	color <- sort(pal(nbColors), decreasing= TRUE)
	zfacet <- z[-1, -1] + z[-1, -ncz] + z[-nrz, -1] + z[-nrz, -ncz]
	facetcol <- cut(zfacet, nbColors)
	par(xlog=xlg,ylog=ylg)
	persp(
		as.numeric(rownames(z)),
		as.numeric(colnames(z)),
		as.matrix(z),
		col = color[facetcol],
		...
	)
}
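
# Example sketch, using the built-in 'volcano' matrix and grDevices:
# z <- volcano
# x <- 1:nrow(z); y <- 1:ncol(z)
# perspWithcol(x, y, z, colorRampPalette(c("yellow", "red")), 100,
# theta = 30, phi = 30, xlg = FALSE, ylg = FALSE)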

biasVarCov <- function(predictions, target, regression = FALSE, idx = 1:length(target))
{
	if (is.factor(target)) 
	{ 
		target = as.numeric(target) 
		if (length(unique(target)) != 2) { stop("Decomposition currently works only for binary classification.") }
	}
	if (is.factor(predictions)) { predictions = as.numeric(predictions) }
	
	noise = ifelse(length(idx) > 1, var(target[idx]), (target[idx] - mean(target))^2)
	squaredBias = ifelse(length(idx) > 1,(mean(predictions[idx]) - mean(target[idx]))^2,(mean(predictions[idx]) - target[idx])^2) 
	predictionsVar = ifelse(length(idx) > 1, var(predictions[idx]), (predictions[idx] - mean(predictions))^2)
	predictionsTargetCov = ifelse(length(idx) > 1,
	cov( cbind(predictions[idx], target[idx]) )[1,2], 
	mean( (predictions[idx] - mean(predictions) ) *(target[idx] - mean(target))))
	
	MSE = noise + squaredBias + predictionsVar - 2*predictionsTargetCov
	
	cat("Noise: ",noise,"\n", sep ="")
	cat("Squared bias: ", squaredBias,"\n", sep ="")
	cat("Variance of estimator: ",  predictionsVar,"\n", sep ="")
	cat("Covariance of estimator and target: ",  predictionsTargetCov, "\n", sep ="")
	
	if (regression)
	{
		object = list(MSE = MSE , squaredBias = squaredBias, predictionsVar = predictionsVar, predictionsTargetCov = predictionsTargetCov)
		cat("\nMSE(Y, Yhat) = Noise(Y) + squaredBias(Y, Yhat) + Var(Yhat) - 2*Cov(Y, Yhat) = ",  MSE, "\n", sep ="")
	}
	else
	{
		object = list(predError = MSE , squaredBias = squaredBias, predictionsVar = predictionsVar, predictionsTargetCov = predictionsTargetCov)
		cat("\nAssuming binary classification with classes {0,1}, where '0' is the majority class.")
		cat("\nMisclassification rate = P(Y = 1)P(Y = 0) + {P(Y = 1) - P(Y_hat = 1)}^2 + P(Y_hat = 0)P(Y_hat = 1) - 2*Cov(Y, Y_hat)")
		cat("\nMisclassification rate = P(Y = 1) + P(Y_hat = 1) - 2*E(Y*Y_hat) = ",  MSE, "\n", sep ="")
	}
	
	return(object)
}
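
# Worked sketch (hypothetical regression data): the printed terms satisfy
# MSE = noise + squared bias + variance - 2*covariance, so bvc$MSE should be
# close to mean((predictions - target)^2):
# set.seed(2014)
# target <- rnorm(100)
# predictions <- 0.8*target + rnorm(100, sd = 0.2)
# bvc <- biasVarCov(predictions, target, regression = TRUE)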
	
dates2numeric <- function(X, whichCol = NULL, inverseDate = FALSE)
{
	lengthChar = nchar(as.character((X[1,whichCol])))

	year = if (inverseDate) { as.numeric(substr(X[,whichCol], 7, 10)) } else { as.numeric(substr(X[,whichCol], 1,4)) }
	month = if (inverseDate) {  as.numeric(substr(X[,whichCol], 4, 5)) } else { as.numeric(substr(X[,whichCol], 6,7)) }
	day =  if (inverseDate) {  as.numeric(substr(X[,whichCol], 1, 2)) } else { as.numeric(substr(X[,whichCol], 9, 10)) }
	
	if (lengthChar > 10) 
	{	
		hour = as.numeric(substr(X[,whichCol], 12,13))	
		if (lengthChar > 13) { minute = as.numeric(substr(X[,whichCol], 15, 16))	}
		else { minute = NULL } 
	}
	else { hour = NULL }
	
	if (is.null(hour)) 	{	return(cbind(year, month, day, X[,-whichCol]))	}
	else {  return(cbind(year, month, day, hour, minute, X[,-whichCol]))	}
	
}	
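
# Example sketch (hypothetical data frame whose first column holds
# "YYYY-MM-DD hh:mm" strings, not factors):
# X <- data.frame(date = c("2014-06-22 01:05", "2014-06-23 14:30"), value = c(1, 2), stringsAsFactors = FALSE)
# dates2numeric(X, whichCol = 1)
# returns the columns year, month, day, hour, minute plus the remaining ones.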

predictionvsResponses <- function(predictions, responses, plotting = TRUE)
{
	linMod = lm(responses ~ predictions)
	print(summary(linMod))
	if (plotting)
	{
		plot(predictions, responses, xlab = "Predictions", ylab = "Responses", lwd = 2.5)
		abline(a = linMod$coef[[1]], b = linMod$coef[[2]], col = "green", lwd = 2)
		grid()
	}	
	# return the fitted model (invisibly), so that callers such as model.stats() can reuse it
	invisible(linMod)
}

rm.tempdir <- function()
{
	path = getwd()
	setwd(tempdir())
	listFiles <- list.files()[grep("file", list.files())]
	if (length(listFiles) > 0) {  file.remove(listFiles) }
	setwd(path)
}


model.stats <- function(predictions, responses, regression = FALSE, OOB = FALSE, plotting = TRUE)
{
    if (inherits(predictions, "randomUniformForest"))  # class(predictions) == "randomUniformForest"
	{
		if (OOB && is.null(predictions$forest$OOB.predicts)) { stop("no OOB predictions in the model.") }
		object = predictions
		if (OOB) { 	predictions = predictions$forest$OOB.predicts }
		else { predictions = predictions$predictionObject$majority.vote }
		
		if (!regression) 
		{
			predictions = as.factor(predictions)
			levels(predictions) = object$classes
		}
	}
	graphics.off()
	#require(pROC)
	if (OOB)  { cat("\nOOB evaluation") }
	else { cat("\nTest set") }
	if (is.factor(predictions) || (!regression))
	{
		uniqueResponses = sort(unique(responses))
		if (!is.factor(responses)) { responses = as.factor(responses) }
		confMat = confusion.matrix(predictions, responses)
		#rownames(confMat) = uniqueResponses
		#colnames(confMat)[-ncol(confMat)] = rownames(confMat)
		cat("\nError rate: ")
		testError = generalization.error(confMat)
		cat(round(100*testError, 2), "%\n", sep="")
		cat("\nConfusion matrix:\n")
		print(round(confMat, 4))
		
		if (length(uniqueResponses) == 2)
		{
			AUC = pROC::auc(as.numeric(responses), as.numeric(predictions))
			AUPR = myAUC(as.numeric(predictions), as.numeric(responses), falseDiscoveryRate = TRUE)$auc
			cat("\nArea Under ROC Curve:", round(AUC, 4))
			cat("\nArea Under Precision-Recall Curve:", 
			round(AUPR, 4))
			cat("\nF1-score:", round(fScore(confMat),4))
			if (plotting)
			{
				roc.curve(predictions, responses, uniqueResponses, printValues = FALSE)
				dev.new( )
				roc.curve(predictions, responses, uniqueResponses, falseDiscoveryRate = TRUE, printValues = FALSE)
			}
		}
		cat("\nGeometric mean:", round(gMean(confMat),4),"\n")
		if (nrow(confMat) > 2)
		{ cat("Geometric mean of the precision:", round(gMean(confMat, precision = TRUE),4),"\n") }
		else
		{ cat("Kappa statistic:", round(kappaStat(confMat), 4),"\n")	}
		
		return(list(confusionMatrix = confMat, testError = testError, AUC = if (length(uniqueResponses) == 2) { AUC } else { "none" }, 
		AUPR = if (length(uniqueResponses) == 2) { AUPR } else { "none" }))
	}
	else
	{
		n = length(responses)
		MSE = L2Dist(predictions, responses)/n
		cat("\nMean of squared residuals: ", round(MSE, 6), "\n", sep="") 
		cat("Variance explained: ", round(100*(1 - MSE/var(responses)),2), "%\n\n", sep = "")	
		Residuals <- summary(responses - predictions)
		names(Residuals) = c("Min", "1Q", "Median", "Mean", "3Q", "Max")
		cat("Residuals:\n")
		print(Residuals)
		cat("Mean of absolute residuals: ", round(L1Dist(predictions, responses)/n, 6), "\n", sep="")
		cat("\nLinear modelling:")
		predictionvsResponsesLinMod = predictionvsResponses(predictions, responses, plotting = plotting)
		dev.new()
		plot(responses,responses - predictions, xlab = "Responses", ylab = "Residuals", lwd = 2.5)
		abline(h = 0, col = "green", lwd = 2)
		grid()
		dev.new()
		plot(density(responses - predictions), lwd = 3, main = "Density of residuals")
		cat("Please use R menu to vertically tile windows and see all plots.\n")
		outputLM = summary(predictionvsResponsesLinMod)
		
		return(list(MSE = MSE, Residuals = Residuals))
	}
}
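
# Usage sketch (hypothetical binary classification vectors; pROC is assumed
# to be loaded, as above):
# predictions <- as.factor(sample(0:1, 100, replace = TRUE))
# responses <- as.factor(sample(0:1, 100, replace = TRUE))
# stats <- model.stats(predictions, responses, plotting = FALSE)
# stats$confusionMatrix; stats$testError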


generic.cv <- function(X, Y, nTimes = 1, k = 10, seed = 2014, regression = TRUE, genericAlgo = NULL, specificPredictFunction = NULL, 
metrics = c("none", "AUC", "precision", "F-score", "L1", "geometric mean", "geometric mean (precision)"))
{
	set.seed(seed)
	s = 1
	testError = metricValues = vector()
	n = nrow(X)
	for (i in 1:nTimes)
	{
		# randomize data
		train_test = sample(n, n)
		# divide data in k folds
		foldsIdx = cut(train_test, k, labels = FALSE)
		
		# for each fold (the number 'j'), choose it as a test set 
		# and choose all the others as the training set
		for (j in 1:k)
		{
			# all folds minus the j-th
			trainIdx = which(foldsIdx != j)			
			X1 = X[trainIdx,]
			Y1 = Y[trainIdx]
			
			testIdx = which(foldsIdx == j)
			X2 = X[testIdx,]
			Y2 = Y[testIdx]
									 
			# for classification, the responses must be given as a factor
			if (!regression) { Y1 = as.factor(Y1) }
			
			# R will find genericAlgo() in the global environment
			if (is.null(genericAlgo))
			{ 
				stop("Please provide a wrapper for the algorithm that needs to be assessed. For example, if randomForest is needed,\njust write: 'genericAlgo <- function(X, Y) randomForest(X, Y)'.\nThen, use the option 'genericAlgo = genericAlgo' in the generic.cv() function.\n")
			}
			else
			{	model = genericAlgo(X1, Y1)	}
			
			# some algorithms like gbm or glmnet require specific arguments
			# for the predict function. One has to create 
			# a specific predict function for them
			if (is.null(specificPredictFunction))
			{ 	predictions = predict(model, X2)	}
			else
			{  predictions = try(specificPredictFunction(model, X2)) }
			
			if (regression)
			{	
				testError[s] = sum((predictions - Y2)^2)/length(Y2)	
			
				if (metrics[1] == "L1") {  metricValues[s] =  sum(abs(predictions - Y2))/length(Y2)	}
			
			}
			else
			{	
				confMat = confusion.matrix(predictions, Y2)
				testError[s] = generalization.error(confMat)
				if (length(unique(Y2)) == 2)
				{	
					if (metrics[1] == "AUC") {  metricValues[s] = pROC::auc(as.numeric(Y2), as.numeric(predictions)) }				
					if (metrics[1] == "precision") 	{ metricValues[s] = confMat[2,2]/sum(confMat[2,]) }				
					if (metrics[1] == "F-score")  	{ metricValues[s] = round(fScore(confMat),4) }
				}
				else 
				{  
					if (metrics[1] == "geometric mean") {   metricValues[s] = round(gMean(confMat),4)	}
					if (metrics[1] == "geometric mean (precision)") {   metricValues[s] = round(gMean(confMat, precision = TRUE),4)	}
				}
			}
			s = s + 1
		}
	}
	
	if (metrics[1] == "none") {  return(list(testError = testError, avgError = mean(testError), stdDev = sd(testError)) ) }
	else {  return(list(testError = testError, avgError = mean(testError), stdDev = sd(testError), metric = metricValues) ) }
}
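
# Usage sketch (hypothetical data; assumes the randomForest package is loaded):
# genericAlgo <- function(X, Y) randomForest(X, Y)
# cv <- generic.cv(X, Y, nTimes = 2, k = 5, regression = FALSE, genericAlgo = genericAlgo)
# cv$avgError; cv$stdDev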

filterOutliers <- function(X, quantileValue, direction = c("low", "high")) 
{ if (direction[1] == "low") { X[X < quantileValue] } else { X[X > quantileValue] } }

which.is.nearestCenter <- function(X, Y)
{
	n = nrow(X)
	K = nrow(Y)
	distToY = matrix(NA, n, K)
	for (k in 1:K)	{ distToY[,k] = apply(X, 1, function(Z) L2Dist(Z, Y[k,])) }	
	return(apply(distToY, 1, which.min))
}

# biased (population) variance
variance <- function(X)  (length(X)-1)/length(X)*var(X) 
 
interClassesVariance <- function(X, classes) 
{        
	m <- tapply(X, classes, mean) 
	l <- tapply(X, classes, length)
	return( sum(l* ( m - mean(X) )^2/length(X)) )
} 
 
intraClassesVariance <- function(X, classes) 
{        
	v <- tapply(X, classes, variance) 
	v[is.na(v)] <- 0 
	l <- tapply(X, classes,length) 
	return( sum(l*v/length(X)) )
}
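
# Sanity check sketch: the (biased) variance decomposes exactly into its
# between-classes and within-classes parts:
# X <- rnorm(100); classes <- sample(1:3, 100, replace = TRUE)
# all.equal(variance(X), interClassesVariance(X, classes) + intraClassesVariance(X, classes))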
 
mergeOutliers <- function(object)
{
	Z = object$unsupervisedModel$cluster
	if (!is.null(object$unsupervisedModel$clusterOutliers))
	{ Z = c(Z, object$unsupervisedModel$clusterOutliers)	}			
	if (!is.null(names(Z))) 
	{ 
		Z = sortDataframe( data.frame(Z, as.numeric(names(Z))), 2)
		return(Z[,1])
	}
	else
	{	return(Z) }	
}

splitVarCore <- function(X, nsplit, lengthOfEachSplit)
{
	X = as.character(X)
	charLength = nchar(X[1])
	XX = matrix(NA, length(X), nsplit)
	k = 1
	for (j in 1:nsplit)
	{
		for (i in 1:length(X))
		{	XX[i,j] = substr(X[i], k,(k + lengthOfEachSplit[j] - 1)) }
		k = k + lengthOfEachSplit[j] 	
	}

	XX  = fillVariablesNames(XX)
	return(XX)
}
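
# Example sketch: split fixed-width codes into 3 parts of lengths 4, 2 and 2:
# splitVarCore(c("20140622", "20150101"), nsplit = 3, lengthOfEachSplit = c(4, 2, 2))
# gives a 2 x 3 character matrix: "2014" "06" "22" and "2015" "01" "01".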

timeStampCore <- function(stamp, n = NULL, begin = 0, end = NULL, windowLength = NULL)
{
	if (!is.null(end))
	{ Z = seq(begin, end, by = stamp) }
	else
	{	
		# 'n' is only needed when generating stamps window by window
		if (is.null(n)) { stop("Please provide 'n', the length of the sample.\n") }
		Z = vector(length = n)
		if (!is.null(windowLength))
		{
			K = floor(windowLength/stamp)
			s = 1
			Z[1] = begin
			Z[1:(s + K-1)] = cumsum(c(Z[1], rep(stamp, K-1)))
			s = s + K 
			for (k in 2:(floor(n/K)))
			{
				Z[s:(s + K-1)] = cumsum(rep(stamp, K))			
				s = s + K 
			}
			if (s < n)
			{	
				gapLength = n - s + 1
				Z[s:n] = cumsum(rep(stamp, gapLength))				
			}
		}
		else
		{ stop("Please provide either a 'end' value or a 'windowLength' one.\n") }
	}
	return(Z)	
}
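
# Example sketch: 10 stamps spaced by 0.5, restarting after each window of length 2:
# timeStampCore(0.5, n = 10, windowLength = 2)
# gives 0, 0.5, 1, 1.5, 0.5, 1, 1.5, 2, 0.5, 1.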

modelingResiduals <- function(object, X, Y, xtest = NULL, f = randomUniformForest, ...)
{
	if ((inherits(object, "randomUniformForest")[1]) || (inherits(object,"randomUniformForest.formula")[1]))
	# ((class(object)[1] == "randomUniformForest") || (class(object)[1] == "randomUniformForest.formula"))
	{
		if (is.null(object$forest$OOB.predicts)) { stop("OOB predictions are needed.") }
		if (!object$forest$regression) { stop("The function works only for regression.") }
		residualsOfObject = object$forest$OOB.predicts - Y
	}
	else
	{
		if (is.null(object$predicted)) { stop("OOB predictions are needed.") }
		if (object$type != "regression") { stop("The function works only for regression.") }
		residualsOfObject = object$predicted - Y
	}
	
	objectForResiduals = f(X, residualsOfObject, xtest = xtest, ...)

	return(objectForResiduals)
}

reSMOTE <- function(X, Y, majorityClass = 0, increaseMinorityClassTo = NULL, 
conditionalTo = NULL,
samplingFromGaussian = FALSE,  
samplingMethod = c("uniform univariate sampling", "uniform multivariate sampling", "with bootstrap"),
maxClasses = max(10, which.is.factor(X[,, drop = FALSE], count = TRUE)), 
seed = 2014)
{
	n = nrow(X)
	flag = FALSE
	# keep the original data in XX; X is converted to a numeric matrix if needed
	XX = X
	if (is.data.frame(X)) 
	{
		factorsIdx = which.is.factor(X, maxClasses = maxClasses)
		X = factor2matrix(X)
		flag = TRUE
	}
	if (n != length(Y)) stop("Data and responses don't have same size.\n")
	
	majorityIdx = NULL
	for (i in seq_along(majorityClass))	{ 	majorityIdx = c(majorityIdx, which(Y == majorityClass[i])) }
	percentMinority = 1 - length(majorityIdx)/n
	minorityIdx = (1:n)[-majorityIdx]
	
	if (is.null(increaseMinorityClassTo)) 
	{ 
		increaseMinorityClassTo = min(2*percentMinority, 0.5)
		cat("The 'increaseMinorityClassTo' argument has not been set.\nThe original minority class percentage has been raised by a factor of 2.\n")
	}
	
	iterations = round(increaseMinorityClassTo/percentMinority, 0)
	
	if (iterations < 1) stop("'increaseMinorityClassTo' must be increased.\n")
	
	if (samplingMethod[1] == "with bootstrap")
	{ 
		cat("'with bootstrap' is only needed as the second element of the 'samplingMethod' option. The default method will be used instead.\n")
		samplingMethod[1] = "uniform univariate sampling"
	}
	
	XXY = vector('list', iterations)
	for (i in 1:iterations)
	{
		XXY[[i]] <- unsupervised2supervised(X, method = samplingMethod[1], conditionalTo = conditionalTo, samplingFromGaussian = samplingFromGaussian, seed = seed,
		bootstrap = if (length(samplingMethod) > 1) { TRUE } else { FALSE })$X[(n+1):(2*n),][minorityIdx,]
	}

	newX = do.call(rbind, XXY)
	rm(XXY)
	newXY = cbind(newX, rep(Y[minorityIdx], iterations))
	p = ncol(newXY)
	colnames(newXY)[p] = "Value"
	if (!is.numeric(Y))
	{	
		levelsY = levels(Y)
		minorityClassIdx = which(!(levelsY %in% as.character(majorityClass)))
		currentValuesOfY = as.factor(newXY[,p])
		levels(currentValuesOfY) = levelsY[minorityClassIdx]
		newXY = as.data.frame(newXY)
		newXY[,p] = as.factor(currentValuesOfY)
		colnames(newXY)[p] = "Class"
		XY = cbind(XX,Y)
		colnames(XY)[p] = "Class"
		
		if (flag) 
		{
			for (j in 1:(p-1))
			{
				if (is.factor(XX[,j]))
				{
					uniqueValues = sort(unique(newXY[,j]))
					newXY[,j] = as.factor(newXY[,j])
					levels(newXY[,j]) = levels(XX[,j])[uniqueValues]
				}			
			}		
		}
		
		ZY = rbind(XY, newXY)
	}
	else
	{
		XY = cbind(X,Y)
		colnames(XY)[p] = "Value"
		if (flag) 
		{
			for (j in 1:(p-1))
			{
				if (is.factor(XX[,j]))
				{
					uniqueValues = sort(unique(newXY[,j]))
					newXY[,j] = as.factor(newXY[,j])
					levels(newXY[,j]) = levels(XX[,j])[uniqueValues]
				}			
			}		
		}
		ZY  = rbind(XY, newXY)	
	}
	cat("Proportion of examples of minority class in the original sample: ", round(100*(length(minorityIdx))/n, 2), "%.\n", sep="")
	cat("Proportion of examples of minority class in the new sample: ", round(100*length(which(ZY[,"Class"] != majorityClass))/nrow(ZY), 2), "%.\n", sep="")
	cat(nrow(ZY), "instances in the new sample.\n")
	
	return(ZY)
}
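
# Usage sketch (hypothetical imbalanced sample; reSMOTE relies on the package's
# internal unsupervised2supervised() function to generate the synthetic examples):
# X <- data.frame(matrix(rnorm(2000), 200, 10))
# Y <- as.factor(c(rep(0, 180), rep(1, 20)))   # '1' is the minority class
# newXY <- reSMOTE(X, Y, majorityClass = 0, increaseMinorityClassTo = 0.3)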

OOBVotesScale <- function(X) t(apply(X, 1, function(Z) 1/sum(Z)*Z))

# import 
subsampleFile <- function(readFile = NULL, writeFile = NULL, sample.size = NULL, header = TRUE) 
{
	# modified code from original function of Norman Matloff.
	# original code : http://heather.cs.ucdavis.edu/Thinning.R
	if (is.null(readFile)) { stop("Please provide the name of the file to read.") }
	else  { infile = readFile }
	   
	if (is.null(writeFile)) { stop("Please provide the name of the file to write.") }
	else  { outfile = writeFile }

	if (is.null(sample.size)) { stop("Please provide 'sample.size', the fraction of lines to keep.") }
	else  
	{ 	k = round(1/sample.size, 0)	}  
    
	ci <- file(infile, "r")
    co <- file(outfile, "w")
   
    if (header) 
    {
		hdr <- readLines(ci, n = 1)
		writeLines(hdr, co)
    }
	
	recnum = 0
	numout = 0
   
	while (TRUE) 
	{
		inrec <- readLines(ci, n = 1)
		if (length(inrec) == 0) 
		{	# end of file: close both connections and report
			close(ci)
			close(co) 
			cat("Number of lines sampled:\n")
			return(numout)
		}
		recnum <- recnum + 1
		if (recnum %% k == 0) 
		{
			numout <- numout + 1
			writeLines(inrec, co)
		}
   	}
}
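
# Usage sketch (hypothetical file names): keep every 10th line, i.e. about
# 10% of the file:
# subsampleFile(readFile = "big.csv", writeFile = "small.csv", sample.size = 0.1)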

copulaLike <- function(Xtest, importanceObject, whichFeatures = NULL, 
whichOrder = c("first", "second", "all"), 
maxClasses = max(10, which.is.factor(Xtest[,, drop = FALSE], count = TRUE)),
outliersFilter = FALSE,
bg = "grey")
{
	flag = FALSE
	if (is.null(whichFeatures) || (whichFeatures[1] == "all"))
	{	
		features = 1:ncol(Xtest)
		featuresNames = colnames(Xtest) 
	}
	else
	{	features = whichFeatures }
	p = length(features)
	pD = vector('list', p)
	featuresIdx = idx = NULL
	
	if (is.character(features[1]))
	{
		featuresNames = features
		newFeatures = vector(length = p)
		for (k in 1:p)	{	newFeatures[k] = which(colnames(Xtest) == features[k])	}
		features = newFeatures
	}
	else
	{	featuresNames = colnames(Xtest)[features] }
	
	newXtest = factor2matrix(Xtest)
	for (j in 1:p)
	{
		pD[[j]] <- partialDependenceOverResponses(newXtest, importanceObject, whichFeature = features[j], 
		whichOrder = whichOrder, outliersFilter = outliersFilter, plotting = FALSE, 
		followIdx = TRUE, maxClasses = maxClasses)
		featuresIdx = c(featuresIdx, rep(features[j], nrow(pD[[j]]$partialDependenceMatrix)))
		idx = c(idx, pD[[j]]$idx)
		pD[[j]] = pD[[j]]$partialDependenceMatrix
	}

	bigDependenceMatrix = do.call(rbind, pD)
	bigDependenceMatrix = cbind(bigDependenceMatrix, featuresIdx, idx)
			
	uniqueIdx = unique(bigDependenceMatrix[,4])
	n = length(uniqueIdx)
	newBigDependenceMatrix = matrix(NA, n, p+1)
	
	if (is.numeric(bigDependenceMatrix[, 2])) {	f = mean } 
	else  {  f = function(X) names(which.max(table((X)))); flag = TRUE }
	
	for (i in 1:n)
	{
		newIdx = which(bigDependenceMatrix[,4] == uniqueIdx[i])
		newBigDependenceMatrix[i,1] = f(bigDependenceMatrix[newIdx, 2])
		# map each feature index to its column in the output matrix
		newBigDependenceMatrix[i, 1 + match(bigDependenceMatrix[newIdx,3], features)] = bigDependenceMatrix[newIdx, 1]
	}
	
	colnames(newBigDependenceMatrix) = c("predictions", featuresNames)
	rownames(newBigDependenceMatrix) = uniqueIdx
	
	factorIdx = which.is.factor(Xtest, maxClasses = maxClasses)
	if (any(factorIdx == TRUE))
	{ 
		newBigDependenceMatrix = as.data.frame(newBigDependenceMatrix)
		matchFactorIdx = rmNA(match(which(factorIdx == 1), features))
		if (length(matchFactorIdx) > 0)
		{
			for (i in 1:length(matchFactorIdx))
			{
				newBigDependenceMatrix[,-1][,features[matchFactorIdx][i]] = 
				as.factor(newBigDependenceMatrix[,-1][,features[matchFactorIdx][i]])
				levels(newBigDependenceMatrix[,-1][,features[matchFactorIdx][i]]) = 
				levels(Xtest[, features[matchFactorIdx][i]])	
			}
		}
	}
	
	if (flag)  
	{ 
		rmClassPrefix = as.character(newBigDependenceMatrix[,1]) 
		trueClasses = as.factor(rm.string(rmClassPrefix, "Class "))
		newBigDependenceMatrix[,1] = trueClasses
	}
	
	return(newBigDependenceMatrix)
}
# END OF FILE
