funcly: Useful Functions in R for All-Purpose Analysis

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# authors: Alireza Hosseini, Reza Hosseini

## a function which partitions the data into two parts
DivideData = function(df, m=NA) {
	n = dim(df)[1]
	if (is.na(m)) {
		m = round(n/2)
	}
	ind1 = sample(1:n, m)
	ind2 = setdiff(1:n, ind1)
	df1 = df[ind1, ]
	df2 = df[ind2, ]
	outList = list(df1=df1, df2=df2)
	return(outList)
}

DivideDataWrt = function(df, prop=1/2, colNames) {
	
	l = list()
	for (i in 1:length(colNames)) {
		l[[i]] = df[ , colNames[i]]
	}

	dataList = split(df, l)
	n = length(dataList)
	m = max(round(n*prop), 1)
	#print(n)
	#print(m)
	ind1 = sample(1:n, m, replace=FALSE)
	ind2 = setdiff(1:n, ind1)
	print(ind1)
	print(ind2)
	df1 = MergeDfList(dfList=dataList, ind=ind1, fcn=rbind)
	df2 = MergeDfList(dfList=dataList, ind=ind2, fcn=rbind)
	#print('***')
	#print(dfList)
	outList = list(df1=df1, df2=df2)
	return(outList)
}

Example = function() {
	n = 100
	x = sample(c('boz', 'asb', 'sag'), n, replace=TRUE)
	y = sample(c('yazd', 'rafsanjan', 'kerman'), n, replace=TRUE)
	z = rnorm(n)
	u = z + rnorm(n)
	df = data.frame(x, y, z, u)
	res = DivideDataWrt(df, colNames=c('x', 'y'))
	res
}

## return info about the class of the df columns
## corrects the class from numeric and integer to factor 
##if the levels are not many
CorrectClassDf = function(df, levelsCutoff=0) {
	colNames = names(df)
	n = length(colNames)
	for (i in 1:n) {
		colName = colNames[i]
		c = class(df[ , colName])[1]
		l = length(unique(df[ , colName]))
		print(paste(colName, c, sep=': '))
		print(paste('the cardinality of the levels of this variable is:', l))
		if ((c=='integer' | c=='numeric') & l<levelsCutoff) 
		{
			df[ , colName] = as.factor(df[ , colName])
		}
	}
	return(df)
}

Example = function() {
	df = trainData
	df = CorrectClassDf(df=df, levelsCutoff=3)
}

## mapping some factors of x (given in 'from') 
# to other factors (given in 'to')
FactorMap = function(x, from, to) {
	
	if (length(from) != length(to)) {
		print('Warning: from and to size do not match'); 
		return(NULL)
	}

	x = factor(x)
	levels(x) = c(levels(x), to)
	k = length(from)
	for (i in 1:k) {
		fac = from[i]
		newfac = to[i]
		ind = which(x == fac)
		if (length(ind) > 0) {
			x[ind] = newfac
		}
	}
	x=factor(as.character(x))
	levels(x) = setdiff(levels(x), from)
	return(x)
}

Example = function() {
	x = sample(c('a', 'b', 'c'), 100, replace=TRUE) 
	x[100] = 'd'       
	x = factor(x)
	
	from = c('a', 'd')
	to = c('u', 'v')
	y = FactorMap(x, from, to)
}

## builds a function which when 
NonExistFactorMap = function(facNames, toFacName) {
	Func = function(y) {
		yFacNames = levels(y)
		nonExistInd = which(!(yFacNames %in% facNames))
		from = yFacNames[nonExistInd]
		to = rep(toFacName, length(from))
		FactorMap(y, from=from, to=to)
	}
	return(Func)
}

## deprecated
NonExistFactorMap2 = function(x, facNames, toFacName) {
	xTab = table(x)
	xFacNames = names(xTab)
	nonExistInd = which(!(xFacNames %in% facNames))
	from = xFacNames[nonExistInd]
	to = rep(toFacName, length(from))
	
  Func = function(y) {
		FactorMap(y, from=from, to=to)
	}
	
  return(Func)
}

Example = function() {
   facNames = c('a', 'b', 'd')
	toFacName = 'zzz'
	y = c('a', 'b', 'g')
	y = factor(y)
	Fcn = NonExistFactorMap(facNames, toFacName)
}   
	
### find out the levels of a factor and assign 
CorrectFactorFcn = function(x, cutoff=3) {
	x = factor(x)
	#levels(x)
	#summary(x)
	tab = table(x)
	facNum = length(tab)
	facNames = names(tab)
	freqInd = which(tab == max(tab))[1]
	freqFac = facNames[freqInd]
	toFacName = freqFac
	from = NULL
	to = NULL
	smallFreqInd = which(tab < cutoff)
	from = facNames[smallFreqInd]
	to = rep(freqFac, length(from))
	
	SmallFreq = function(y) {
		FactorMap(y, from=from, to=to)        
	}
	
	x = SmallFreq(x)
	NonExist = NonExistFactorMap(
		facNames=facNames, 
		toFacName=toFacName)
	
	Combine = function(y) {
		SmallFreq(NonExist(y))
	}
	
	outList = list('x'=x, 'SmallFreq'=SmallFreq, 
		'NonExist'=NonExist, 'Combine'=Combine)
	return(outList)
}

Example = function() {
	x = sample(c('a', 'b', 'c'), 100, replace=TRUE) 
	x[100] = 'd'       
	res = CorrectFactorFcn(x)
	newX= res$x    
	SmallFreq = res[[2]]
	NonExist = res[[3]]    
	Combine = res[[4]]    
	y = c('d', 'd', 'd', 'd')
	SmallFreq(y)    
	y = c('d', 'd', 'zereshk')
	y = factor(y)
	NonExist(y)    
	Combine(y)
}

### find out the levels of a factor and assign for dataframes
### this will assign little-appearing factors to most frequent factor
### this will also assign non-existing factors to most frequent factor
CorrectFactorDf = function(df, cutoff=3) {
	
	colNames = names(df)
	n = length(colNames)
	fcnList = list()
	for (i in 1:n) {
		colName = colNames[i]
		c = class(df[ , colName])[1]
		fcnList[[i]] = (f=function(x){return(x)})
		if (c == 'factor') {
			out = CorrectFactorFcn(df[ , colName], cutoff=cutoff)
			df[ , colName] = out[['x']]
			fcnList[[i]] = out[['Combine']] 
		}
	}
	outList = list(df=df, fcnList=fcnList, Fcn=FcnListDfFcn(fcnList))
	return(outList)
}

Example = function() {
	
  x = sample(c('a', 'b', 'c'), 100, replace=TRUE) 
	x[100] = 'd'    
	y = sample(c('aa', 'bb', 'cc'), 100, replace=TRUE) 
	y[99]='dd'
	y[100] = 'dd'    
	z = 1:100    
	df = data.frame(x=x, y=y, z=z)     
	res = CorrectFactorDf(df, cutoff=3)
	df2 = res[['df']]
	fcn = res[['Fcn']]

	x = sample(c('a', 'b', 'e'), 100, replace=TRUE) 
	x[100] = 'd'
	y = sample(c('aa', 'ee', 'cc'), 100, replace=TRUE) 
	y[99]='dd'
	y[100] = 'dd'
	z = 1:100
	newDf = data.frame(x=x, y=y, z=z)
	fcn(newDf)

	fcnList = res[['fcnList']]
	FcnListDfFcn(fcnList)(newDf)
}
	
### from a factor variable build numeric columns and replace in df
FactorDfNumeric = function(df) {
	
  m = dim(df)[2]
	outDf = NULL
	ind = NULL
	count = 0
	
  for (i in 1:m) {
		x = df[ , i]
		if (class(x) == 'factor') {
			name = colnames(df)[i]
			colnames(df)[i] = paste(name, '_', sep='')
			name = colnames(df)[i]
			ind = c(ind, i)
			#print(class(x))
			string = paste('xTrans = model.matrix( ~ ', name, '-1, data=df)', sep='')
			#print(string)
			eval(parse(text=string))
			xTrans = data.frame(xTrans)
			xTrans2 = xTrans[ , -1, drop=FALSE]
			if (count == 0) {
				outDf = xTrans2
			} else {
				outDf = cbind(outDf, xTrans2)
			}
			count = count + 1
		}
	}
	df2 = df
	if (length(ind) > 0) {
		df2 = df2[ , -ind, drop=FALSE]
		df2 = cbind(df2, outDf)
	}
	return(df2)
}

Example = function() {
	df = data.frame(x=1:10, y=c(rep('a', 5), rep('b', 5)),
		z=c(rep('c', 5), rep('d', 5)))
	FactorDfNumeric(df)
}

Example = function() {
	dates = c("2011-01-01 ", "2011-01-01", "2011-01-01")
	hrs = c('0', '1', '21')
	df = data.frame(date=dates, hr=hrs)

	BuildDatetime(df=df, dateCol='date', hrCol='hr')
}

### Fourier series
Fseries = function(data, colName, omega=2*pi, ord=1) {
	
  asc = as.character
	n = dim(data)[1]
	outDf = matrix(NA, n, 2*ord)
	outDf = data.frame(outDf)
	x = data[ , colName]
	columns = list()
	ind = 0 
	
  for (k in 1:ord) {
		sincoln = paste('sin', asc(k), '_', colName, sep='')
		coscoln = paste('cos', asc(k), '_', colName, sep='')
		columns = append(columns, sincoln)
		columns = append(columns, coscoln)
		u = omega*k*x
		sin = sin(u)
		cos = cos(u)
		ind = ind + 1
		outDf[ , ind] = sin
		names(outDf)[ind] = sincoln
		ind = ind + 1
		outDf[ , ind] = cos
		names(outDf)[ind] = coscoln
	}
	return(list('df'=outDf, 'columns'=columns)) 
}    
		   
Example = function() {
	
  x = seq(0, 2, 0.01)
	data = data.frame('x'=x)
	df = Fseries(data, colName='x') 
	
	cols = df[['columns']]
	df = df[['df']]
	plot(x, df[ , 2])
}

### multiple F series
FseriesMulti = function(data, colNames, omegas=NA, ords=NA) {
	
  k = length(colNames)
	if (is.na(omegas[1])) {
		omegas = rep(2*pi, length(colNames))
	}
	if (is.na(ords[1])) {
		ords = rep(1, length(colNames))
	}
	outData = NULL
	outColumns = list()
	
	for (i in 1:k) {
		colName = colNames[i]
		omega = omegas[i]
		ord = ords[i]
		FS = Fseries(data=data, colName=colName, omega=omega, ord=ord)
		FSdf = FS[['df']]
		FScolumns = FS[['columns']]
		if (i == 1) {
			outData=FSdf
		} else {
			outData = cbind(outData, FSdf)
		}

		outColumns = append(outColumns, FScolumns)
	}
	return(list('df'=outData, 'columns'=outColumns))        
}                 

Example = function() {
  
	x = seq(0, 2, 0.01)
	y = 2*seq(0, 2, 0.01)
	z = 4*x
	data = data.frame('x'=x, 'y'=y, 'z'=z)
	colNames = colnames(data)
	
	res = FseriesMulti(data, colNames=c('x', 'y', 'z'), omegas=NA, ord=NA) 
	df = res[['df']]
	colNames = res['columns']
	 
	plot(df[ , 1] ,df[ , 2])
	plot(x, df[ , 1])
}

### create an interaction matrix using two explan matrices: explan1, explan2
### the interactions needed are taken from interactList
### every tuple in interact list starts with an index in explan1
### then it follows by indices in interact2
CreateInteractFcn = function(
    interactList=NULL,
    interactMatrix=NULL,
    sep='_') {
	
  #print(interactList)
	#print(interactMatrix)
	if (is.null(interactList)) {
		x = interactMatrix
		interactList = split(t(x), rep(1:ncol(t(x)), each=nrow(t(x))))
	}
	print(interactList)
	
	Func = function(explan1, explan2) {
		k = length(interactList)
		k1 = dim(explan1)[2]
		k2 = dim(explan2)[2]

		n1 = dim(explan1)[1]
		n2 = dim(explan2)[1]
		if (n1 != n2) {
			print('explan1 and explan2 row nums do not match')
		}
		explanInteract = NULL
		for (i in 1:k) {
			index = interactList[[i]]
			explan1_index = index[1]
			explan2_index = index[-1]
			for (j in explan2_index) {
				explanInteract = cbind(explanInteract, 
					(explan1[ , explan1_index]*explan2[ , j]))
			}
		}

		explanInteract = data.frame(explanInteract)

		count = 0
		for (i in 1:k) {
			index = interactList[[i]]
			explan1_index = index[1]
			explan2_index = index[-1]
			if (!is.null(explan2_index)) {
				for (j in explan2_index) {
					count = count + 1
					name = paste('int', explan1_index, j, sep=sep)
					name1 = names(explan1)[explan1_index]
					name2 = names(explan2)[j]
					#print(name1);print(j);print(names(explan2));print(name2)
					if (!is.null(name1) & !is.null(name2)) {
						name=paste(name, sep, name1, sep, name2, sep='')
					}
					names(explanInteract)[count] = name
				}
			}
		}
		return(explanInteract)
	}
	return(Func)
}

Example = function() {
	
  explan1 = matrix(1:9, 3, 3)
	explan2 = matrix(11:19, 3, 3)
	interactList = list(c(1, 1, 2), c(2, 1, 2))
	fcn = CreateInteractFcn(interactList)
	fcn(explan1, explan2)
	explan1 = matrix(1:9, 3, 3)
	explan2 = matrix(11:19, 3, 3)
	explan1 = data.frame(explan1)
	explan2 = data.frame(explan2)
	names(explan1) = c('a', 'b', 'c')
	names(explan2) = c('t', 'u', 'v')

	interactList = list(c(3, 1, 2, 3), c(1, 2))
	CreateInteractFcn(interactList)(explan1, explan2)
	interactMatrix = rbind(c(1, 2), c(2, 3), c(1, 1))
	CreateInteractFcn(interactMatrix=interactMatrix)(explan1, explan2)
}

### randomForest prediction 
RfFit = function(y, explan) {
	
  InstallLoad('randomForest')
	mod = randomForest(y ~ ., data=explan)
	return(mod)

}

Example = function() {
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ ,2]+rnorm(n)
	mod = RfFit(yObs, explan)
}    


## fitting via aggregate
AggFit = function(
	  y,
    explan,
    yName='value',
    wrtCols=NULL, 
	  AggFunc=mean,
    aggrFuncName='mean') {
	  explan = data.frame(explan)
	
	if (length(wrtCols) == 0) {
		wrtCols=colnames(explan)
	}
	
	data = cbind(y,explan)
	colnames(data)[1] = yName

	aggModDf = SummWrt(
		  data,
      valueCol=yName,
      wrtCols=wrtCols,
      probs=NULL, 
		  fcnList=list(AggFunc),
      fcnName='mean') 
	
  aggMod = list(df=aggModDf)
	class(aggMod) = 'aggMod'
	return(aggMod)
}

AggPred = function(aggMod, explan) {
	colNames = colnames(aggMod$df)[-length(colnames(aggMod$df))]
	yName = colnames(aggMod$df)[length(colnames(aggMod$df))]
	#print(dim(explan))
	explan[ , 'orderTemp'] = 1:dim(explan)[1]
	out = merge(explan, aggMod[['df']], by=colNames, all.x=TRUE)
	out = out[order(out[ , 'orderTemp']), ]
	yPred = out[ , yName]
	return(yPred)
}

predict.aggMod = function(aggMod, explan, ...) {
	return(AggPred(aggMod, explan))
}

print.aggMod = function(aggMod) {
	print(aggMod[['df']])
}

summary.aggMod = function(aggMod) {
	return(aggMod[['df']])
}

## remove Agg Value such as mean, median
RemoveAggValue = function(
	data,
  colName,
  wrtCols=NULL, 
	AggFunc=mean,
  aggrFuncName='mean',
	addToData=TRUE,
  newColName=NULL) {
	
	aggMod = AggFit(y=data[ , colName], explan=data, yName=colName, 
		wrtCols=wrtCols, AggFunc=AggFunc, aggrFuncName=aggrFuncName) 
	yPred = AggPred(aggMod, explan=data)
	out = data[ , colName] - yPred
	if (addToData) {
		out = cbind(data, out)
		if (is.null(newColName)) {
			newColName = paste(colName, '_', aggrFuncName, '_removed', sep='')
		}
		colnames(out)[dim(out)[2]] = newColName
	}
	return(out)
}

Example = function() {
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y = (
		0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) +
		20*(u1==5) + 1*(u2==1) + rnorm(n))
	explan = data.frame(x1=x1, x2=x2)
	data = cbind(y,explan)
	RemoveAggValue(data,colName='y', wrtCols=c('x1', 'x2'), AggFunc=mean, 
		aggrFuncName='mean', addToData=TRUE)
}

###### imputation
### impute col
ImputeCol = function(
    df,
    valueCol,
    wrtCols,
    AggFunc=NULL,
    aggrFuncName="agg") {
	
  if (is.null(AggFunc)) {
		AggFunc = function(x){mean(x, na.rm=TRUE)}
	}
	
  y = df[ , valueCol]
	
  aggMod = AggFit(
      y=y,
      explan=df,
      yName=valueCol,
		  wrtCols=wrtCols,
      AggFunc=AggFunc,
      aggrFuncName=aggrFuncName) 
	
  yPred = AggPred(aggMod, explan=df)
	
	ind = which(is.na(y))
	out = y
	out[ind] = yPred[ind]
	return(out)
}

Example = function() {
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + rnorm(n)
	y[1:5] = NA
	explan = data.frame(x1=x1, x2=x2)
	data = cbind(y, explan)
	ImputeCol(df=df, valueCol='y', wrtCols=c('x1', 'x2'), AggFunc=NULL)
}

### impute data
ImputeDf = function(
  	df,
	  valueCols,
	  wrtCols,
	  AggFunc=NULL,
	  replaceCol=TRUE) {
	
	out = NULL
	for (i in 1:length(valueCols)) {
		valueCol = valueCols[i]
		
    newCol = ImputeCol(
        df=df,
        valueCol=valueCol,
        wrtCols=wrtCols,
        AggFunc=NULL)
		
    out = cbind(out, newCol)
		colnames(out)[i] = valueCol
		if (replaceCol) {
			df[ , valueCol] = newCol
		} 
	}
	
	if (replaceCol) {
		out = df
	}

	return(out)

}

Example = function() {
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y1 = (
      0*(u1 == 1) +
      8*(u1 == 2) +
      10*(u1 == 3) +
      15*(u1 == 4) +
      20*(u1 == 5) +
      1*(u2 == 1) + rnorm(n))

	y2 = (
      0*(u1 == 1) +
      3*(u1 == 2) +
      1*(u1 == 3) +
      5*(u1 == 4) +
      25*(u1 == 5) +
      1*(u2==1) + rnorm(n))
	
  y1[1:5] = NA
	y2[5:10] = NA
	explan = data.frame(x1=x1, x2=x2)
	df = cbind(y1, y2, explan)
	ImputeDf(df=df, valueCols=c('y1', 'y2'), wrtCols=c('x1', 'x2'))
}

#setMethod(predict, aggMod, .aggMod)
Example = function() {
	n = 500
	u1 = sample(1:5, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y = (
      0*(u1 == 1) +
      8*(u1 == 2) +
      10*(u1 == 3) +
      15*(u1 == 4) +
      20*(u1 == 5) +
      1*(u2 == 1) + rnorm(n))

	explan = data.frame(x1=x1,x2=x2)
	aggMod = AggFit(y, explan)
	yPred = AggPred(explan, aggMod)
	plot(y, yPred)
	yPred = predict(aggMod, explan)
}

Example = function() {
	n = 500
	u1 = sample(1:5, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	 y = (
      0*(u1 == 1) +
      8*(u1 == 2) +
      10*(u1 == 3) +
      15*(u1 == 4) +
      20*(u1 == 5) +
      1*(u2 == 1) + rnorm(n))

	explan = data.frame(x1=x1, x2=x2)
	aggMod = AggFit(y, explan)
	yPred = predict(aggMod, explan)
	plot(y, yPred)

	FitAssess(yObs=y, explan=explan, model='Agg', wrtCols=NULL)
}

## fit dev models
DevModelFit = function(
  	y,
	  explanBasic,
	  explanDev,
	  basicModel,
	  devModel, 
	  devFcn=NA,
	  devInvFcn=NA,
	  form='additive',
	  wrtCols=NULL,
	  family='gaussian') {
	
	if (!is.null(form) & form == 'additive') {
		devFcn = function(x, y)(x - y)
		devInvFcn = function(x, y)(x + y)
	}
	
	if (!is.na(form) & form=='multi') {
		devFcn = function(x, y)(x/y)
		devInvFcn = function(x, y)(x*y)
	}
	
	bmod = basicModel(y, explan=explanBasic)
	yFitBasic = predict(bmod, explan=explanBasic, wrtCols=wrtCols)
	e = devFcn(y, yFitBasic)
	dmod = devModel(e, explanDev)
	eFit = predict(dmod, explanDev)
	out = list(bmod=bmod, dmod=dmod, devFcn=devFcn, devInvFcn=devInvFcn)
	class(out) = 'devMod'
	return(out)
}

print.devMod = function(devMod) {
	bmod = devMod[['bmod']]
	dmod = devMod[['dmod']]
	print('basic model')
	print(bmod)
	print('dev model')
	print(dmod)
}

predict.devMod = function(devMod, explanBasic, explanDev, ...) {
	bmod = devMod[['bmod']]
	dmod = devMod[['dmod']]
	devFcn = devMod[['devFcn']]
	devInvFcn = devMod[['devInvFcn']]
	yPredBasic = predict(bmod, explanBasic)
	ePred = predict(dmod, explanDev)
	yPred = devInvFcn(yPredBasic, ePred) 
	return(yPred)
}

Example = function() {
	n = 500
	u1 = sample(1:5, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	x3 = rnorm(500)
	x4 = rnorm(500)
	y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + rnorm(n)

	explanBasic = data.frame(x1=x1, x2=x2)
	explanDev = data.frame(x3, x4)

	# glm
	devMod = DevModelFit(
      y, explanBasic=explanBasic, explanDev=explanDev,
		  basicModel=AggFit, devModel=GlmFitMatr, devFcn=NA,
      devInvFcn=NA, form='additive')

	# RF
	devMod = DevModelFit(
      y, explanBasic=explanBasic, explanDev=explanDev,
		  basicModel=AggFit, devModel=RfFit, devFcn=NA,
      devInvFcn=NA, form='additive')

	yFit = predict(
      devMod, explanBasic=explanBasic, explanDev=explanDev)
	
  plot(y, yFit)
	print(devMod)
	
	gmod = GlmFitMatr(y, explan)
	yFit = predict(gmod, explan=explan)
	plot(y, yFit)
}

## Fitting various algorithms in R
FitAssess = function(
    yObs,
    explan=NULL,
    model, 
  	indTrain=NULL,
    indValid=NULL,
    obsDamping=TRUE,
  	ymin=NULL,
    ymax=NULL,
    wrtCols=NULL,
    explanBasic=NULL, 
  	explanDev=NULL,
    basicModel=NULL, 
  	devModel=NULL,
    devForm='additive',
    family='gaussian', 
  	larsType='lasso',
    divideColNames=NULL, 
  	divideProp=1/2,
    explanDivide=NULL,
  	divideRangeCols=NULL,
    divideRangeList=NULL) {
	
	n = length(yObs)
	divideOrder = 1:n
 	
	if (!is.null(divideColNames)) {
		data = cbind(divideOrder, explanDivide)
		colnames(data)[1] = 'divideOrder'
		res = DivideDataWrt(df=data, prop=divideProp,
			colNames=divideColNames)
		dataTrain = res[[1]]
		dataValid = res[[2]]
		indTrain = dataTrain[ , 'divideOrder']
		indValid = dataValid[ , 'divideOrder']
	} 
	
	if (!is.null(divideRangeCols)) {
		logicalInd = rep(1, n)
		for (i in 1:length(divideRangeCols)) {
			colName = divideRangeCols[i]
			x = explanDivide[ , colName]
			colRange = divideRangeList[[i]]
			logicalInd = (x < colRange[2] & x > colRange[1])*logicalInd   
		}
		indTrain = which(logicalInd>0)
		indValid = setdiff(1:n, indTrain)
	}
	
	
	if (is.null(indTrain)) {
		m = round(n/2)
		indTrain = sample(1:n, m)
	}
	
	if (is.null(indValid)) {
		indValid = setdiff(1:n, indTrain)
	}
	
	yTrain = yObs[indTrain]
	yValid = yObs[indValid] 
	explanTrain = explan[indTrain, , drop=FALSE]
	explanValid = explan[indValid, , drop=FALSE]
	
	yminObs = min(yTrain)
	ymaxObs = max(yTrain)
	
	if (model == 'RF') {
		InstallLoad('randomForest')
		mod = RfFit(y=yTrain, explan=explanTrain)
		yFit = predict(mod, newdata=explanTrain)
		yPred = predict(mod, newdata=explanValid)
	}
	
	if (model == 'GLM') {
		mod = GlmFitMatr(y=yTrain, explan=explanTrain, family=family)
		yFit = predict(mod, explan=explanTrain)
		yPred = predict(mod, explan=explanValid)
	}
	
	if (model == 'LARS') {
		InstallIf('lars')
		library('lars')
		xTrain  = FactorDfNumeric(explanTrain)
		xTrain = as.matrix(xTrain)
		p = dim(xTrain)[2]
		mod = lars(x=xTrain, y=t(yTrain), type=larsType)
		res = predict.lars(mod, newx=xTrain, type='fit')
		fit = res[["fit"]]
		yFit = fit[,(p+1)]
		xValid = FactorDfNumeric(explanValid)
		xValid = as.matrix(xValid)
		res = predict.lars(mod, newx=xValid)
		pred = res$fit
		yPred = pred[ , (p+1)]
	}
		
	if (model == 'PLS') {
		InstallIf('pls')
		library('pls')
		ncomp=10
		#X1 = explan_all
		xTrain = as.matrix(explan[indTrain, ])
		#PLSmod = mvr(y1~X1,  ncomp=6 , data = mydata)
		mod = plsr(yTrain ~ xTrain, ncomp=ncomp)
		fit = predict(mod, newdata=xTrain)
		yFit = fit[ , , ncomp]
		yFit = MaxComp(yFit, 0)
		xValid = explan[indValid, ]
		xValid = as.matrix(xValid)
		pred = predict(mod, newdata=xValid)
		yPred = pred[ , , ncomp]
	}
	
	
	if (model == 'Agg') {
		mod = AggFit(y=yTrain, explan=explanTrain, wrtCols=wrtCols)
		yFit = AggPred(mod, explan=explanTrain)
		yPred = AggPred(mod, explan=explanValid)
	}
	
	if (model == 'Dev') {
		if (is.null(explanBasic)) {explanBasic=explan}
		if (is.null(explanDev)) {explanDev=explan}
		explanBasicTrain = explanBasic[indTrain, ]
		explanBasicValid = explanBasic[indValid, ]
		explanDevTrain = explanDev[indTrain, ]
		explanDevValid = explanDev[indValid, ]
		mod = DevModelFit(
			  yTrain, explanBasic=explanBasicTrain, explanDev=explanDevTrain,
			  basicModel=basicModel, devModel=devModel, devFcn=NA, devInvFcn=NA,
			form=devForm, wrtCols=wrtCols)
		
    yFit = predict(mod, explanBasic=explanBasicTrain, explanDev=explanDevTrain)
		yPred = predict(mod, explanBasic=explanBasicValid, explanDev=explanDevValid)
	}
	
	summary(mod)
	
	if (obsDamping) {
		yFit = MaxComp(yFit, yminObs)
		yFit = MinComp(yFit, ymaxObs)
	}
	if (obsDamping) {
		yPred = MaxComp(yPred, yminObs)
		yPred = MinComp(yPred, ymaxObs)
	}
	
	if (!is.null(ymin)) {
		yFit = MaxComp(yFit, ymin)
		yPred = MaxComp(yPred, ymin)
	}
	if (!is.null(ymax)) {
		yFit = MinComp(yFit, ymax)
		yPred = MaxComp(yPred, ymin)
	}
	
	indOK = which(!is.na(yPred))
	num = length(yPred) - length(indOK)
	if (num > 0) {
		print('*** Warning: this number of predictions are NA:')
		print(num)
	}
	
	par(mfrow=c(1, 2))
	plot(yTrain, yFit, col=rgb(0,  0, 0, alpha=0.1), pch=20)
	print('fit cor and rmse')
	print(cor(yTrain, yFit))
	abline(0, 1, col='blue')
	rmse = sqrt(mean((yTrain-yFit)^2))
	print(rmse)
	plot(yValid,yPred,col=rgb(0, 0, 0, alpha=0.1), pch=20)
	abline(0, 1, col='blue')
	print('test cor and rmse')
	print(cor(yValid[indOK], yPred[indOK]))
	rmse = sqrt(mean((yValid[indOK] - yPred[indOK])^2))
	print(rmse)
	return(mod)
}

Example = function() {
	### TRY GLM
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
	mod = FitAssess(yObs, explan, model='GLM')
	
	### TRY LARS
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
	mod = FitAssess(yObs, explan, model='LARS')
	
	### TRY Random Forest
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
	mod = FitAssess(yObs, explan, model='RF')
	
   
  ### TRY Dev models
	n = 400
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels=c("p1", "p2", "p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels=c("rich", "poor"))
	x3 = rnorm(n)
	x4 = rnorm(n)
	yObs = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)

	explanBasic = data.frame(x1=x1, x2=x2)
	explanDev = data.frame(x3, x4)
	explan = cbind(explanBasic, explanDev)
	
	mod = FitAssess(yObs, explanBasic=explanBasic, explanDev=explanDev,
		model='Dev', basicModel=AggFit, devModel=GlmFitMatr)
	
	mod = FitAssess(yObs, explanBasic=explanBasic, explanDev=explanDev,
		model='Dev', basicModel=AggFit, devModel=RfFit)
	
	## divide data wrt specific columns
	mod = FitAssess(yObs, explan=cbind(explanBasic, explanDev), model='GLM')
	
	
	mod = FitAssess(yObs, explan=cbind(explanDev), model='GLM',
		divideColNames=c('x1', 'x2'), divideProp=1/2, explanDivide=explanBasic)
	
	mod = FitAssess(yObs, explan=cbind(explanDev), model='GLM',
		explanDivide=explan[ , c('x3', 'x4')], divideRangeCols=c('x3', 'x4'), 
		divideRangeList=list(c(0, 1), c(0, 2)))
	
	
	dig_deeper = function() {
		basicModel = AggFit
		devModel = GlmFitMatr
		indTrain = 1:250
		indValid = 250:500
		explanBasicTrain = explanBasic[indTrain, ]
		explanBasicValid = explanBasic[indValid, ]
		explanDevTrain = explanDev[indTrain, ]
		explanDevValid = explanDev[indValid, ]
		mod = DevModelFit(
			yTrain, explanBasic=explanBasicTrain, explanDev=explanDevTrain,
			basicModel=basicModel, devModel=devModel, devFcn=NA, devInvFcn=NA,
			form=devForm,
			wrtCols=wrtCols)
		yFit = predict(mod, explanBasic=explanBasicTrain, explanDev=explanDevTrain)
		yPred = predict(mod, explanBasic=explanBasicValid, explanDev=explanDevValid)
	}
}

### categorical
FitAssessCat = function(
    y, explan, model, indTrain=NA, indValid=NA) {
	
  n = dim(explan)[1]
	if (is.na(sum(indTrain))) {
		m=round(n/2)
	}
	
	indTrain = sample(1:n, m)
	indValid = setdiff(1:n, indTrain)
	yTrain = y[indTrain]
	yValid = y[indValid] 
	explanTrain = explan[indTrain, ]
	explanValid = explan[indValid, ]
	
	if (model == 'RF') {
		mod = RfFit(y=yTrain, explan=explanTrain)
		summary(mod)
		yFit = predict(mod)
		yPred = predict(mod, newdata=explanValid)
	}
	
	
	if (model == 'nnet') {
		library('nnet')
		size = NA
			# scale inputs: divide by 50 to get 0-1 range
		mod = nnet(yTrain ~ ., data=X1, size=size) 
		 # multiply 50 to restore original scale
		fit = predict(mod)
		L = length(yTrain)
		yFit = rep(NA, L)
		NAMES = levels(yTrain)
		for (i in 1:L) {
			ind = which(fit[i, ] == max(fit[i, ]))[1]
			yFit[i] = NAMES[ind]
		}

		pred = predict(mod, newdata=xValid)
		L = length(yValid)
		yPred = rep(NA, L)
		NAMES = levels(yValid)
		for (i in 1:L) {
			ind = which(pred[i, ]==max(pred[i, ]))[1]
			yPred[i] = NAMES[ind]
		}
	}
	
	
	if (model == 'RSNNS') {
		library('RSNNS')
		mod = mlp(x=xTrain, y=yTrain)
		yFit = predict(mod)
		yPred = predict(mod, newdata=xValid)
	}
	
	if (model == 'multiLogistic') {
		mod = multinom(formula=yTrain ~ ., data=xTrain)
		yFit = predict(mod)
		yPred = predict(mod, data=xValid)
		res = categPredErr(yTrain, yFit)
	}
	
	if (model == 'SVM') {
		library('e1071')
		mod = svm(y=yTrain, x=xTrain)
		yFit = predict(mod)
		yPred = predict(mod, newdata=xValid)
	}
	
	par(mfrow=c(1, 2))
	plot(yTrain, yFit)
	plot(yValid, yPred)
	 
	res = categPredErr(yTrain, yFit)
	print('Fit errors')
	print(res)
	res = categPredErr(yValid, yPred)
	print('Pred Errors')
	return(mod)
}

### model matrix
## gets a list, for a list element which is a vector of variables build interactions
## using DesignMatFcn below should be better
ModelMatrixViaList = function(formulaList, data, dropIntercept=TRUE) {
	colsToText = function(colNames) {
		for (k in 1:length(colNames)) 
		{
			if (k == 1) {
				out = colNames[1]
			} else {
				out = paste(out, '*', colNames[k], sep='')
			}
		}
		 return(out)
	}

	formulaText = '~'
	n = length(formulaList)
	for (i in 1:n) {
		colNames = formulaList[[i]]
		print(colNames)
		Text = colsToText(colNames)
		print(Text)
		if (i == 1) {
			formulaText = paste(formulaText, Text, sep='')
		}
		if (i > 1) {
			formulaText = paste(formulaText,'+', Text, sep='')
		}
	}
	print(formulaText)
	
	runText = paste('outData = model.matrix(', formulaText, ', data=data)', sep='')
	eval(parse(text=runText))
	if (dropIntercept) {outData = outData[ , -1 , drop=FALSE]}
	return(outData)
}

Example = function() {

   ### TRY Dev models
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1,labels=c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels=c("rich", "poor"))
	x3 = rnorm(n)
	x4 = rnorm(n)
	yObs = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)

	data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4)
	formulaList = list('x1', c('x1', 'x2'))
	
	formulaList = list('x1')
	res = ModelMatrixViaList(formulaList, data)

	formulaList = list('x1', 'x2')
	ModelMatrixViaList(formulaList, data)


	formulaList = list('x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x2+x1*x2')
	ModelMatrixViaList(formulaList, data)
}

Example = function() {
	### TRY Dev models
	n = 50
	x1 = rnorm(n)
	x2 = rnorm(n)
	x3 = rnorm(n)
	x4 = rnorm(n)
	
	data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4)
	formulaList = list('x1', c('x1', 'x2'))
	
	formulaList = list('x1')
	ModelMatrixViaList(formulaList, data)

	formulaList = list('x1', 'x2')
	ModelMatrixViaList(formulaList, data)


	formulaList = list('x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x2+x1*x2')
	ModelMatrixViaList(formulaList, data)
}

### build a new function which removes NA
RemoveNAfcn = function(f) {
	G = function(x, ...) {
		f(x, na.rm=TRUE, ...)
	}
	return(G)
}

Example = function() {
	f = mean
	x = c(1:10, NA)
	f(x)
	RemoveNAfcn(f)(x)
}

## written first for uber excercise
## if the model is not glm then we need to get the design matrix (numerical matrix)
DesignMatFcn = function(df, formulaString, DfTrans=function(x){x}) {
	#print('DesignMatFcn: data')
	formula = as.formula(formulaString)
	## design matrix
	#print('DesignMatFcn: data')
	#print(data[1, ])

	
	## this builds the design matrix for current data 
	## as well as future data
	## for current data we pass nothing
	Fcn = function(newdata=NULL) {
		#print('DesignMatFcn: dim newdata'); print(dim(newdata))
		newdata = DfTrans(newdata)
		df2 = rbind(df, newdata)
		# print('dim data2'); print(dim(data2))
		m = model.frame(formula=formula, data=df2)
		x2 = model.matrix(formula, m)
		x2 = matrix(x2, dim(x2)[1], dim(x2)[2])
		#print(dim(x2))
		nData = dim(df)[1]
		nNewdata = dim(newdata)[1]
		#print('nData'); print(nData); print('nNewdata'); print(nNewdata); print('sum')
		#print(nData+nNewdata); print('dim x2')
		x = x2[(nData + 1):(nData + nNewdata), , drop=FALSE]
		return(x)
	}
	return(Fcn)
}

Example = function() {

	n = 100
	m = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1,labels=c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels=c("rich", "poor"))
	x3 = rnorm(n)
	x4 = rnorm(n)
	y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)

	data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y)[1:m, ]
	newdata = data.frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y)[(m+1):n, ]



	formulaText = 'y~x1+x2'
	formula = as.formula(formulaText)
	m = model.frame(formula=formula, data=data)
}