R/modeling.R

Defines functions DivideData DivideDataWrt Example CorrectClassDf Example FactorMap Example NonExistFactorMap NonExistFactorMap2 Example CorrectFactorFcn Example CorrectFactorDf Example FactorDfNumeric Example Example Fseries Example FseriesMulti Example CreateInteractFcn Example RfFit Example AggFit AggPred predict.aggMod print.aggMod summary.aggMod RemoveAggValue Example ImputeCol Example ImputeDf Example Example Example DevModelFit print.devMod predict.devMod Example FitAssess Example FitAssessCat ModelMatrixViaList Example Example RemoveNAfcn Example DesignMatFcn Example

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# authors: Alireza Hosseini, Reza Hosseini

## a function which partitions the data into two parts
DivideData = function(df, m=NA) {
	n = dim(df)[1]
	if (is.na(m)) {
		m = round(n/2)
	}
	ind1 = sample(1:n, m)
	ind2 = setdiff(1:n, ind1)
	df1 = df[ind1, ]
	df2 = df[ind2, ]
	outList = list(df1=df1, df2=df2)
	return(outList)
}

DivideDataWrt = function(df, prop=1/2, colNames) {
	
	l = list()
	for (i in 1:length(colNames)) {
		l[[i]] = df[ , colNames[i]]
	}

	dataList = split(df, l)
	n = length(dataList)
	m = max(round(n*prop), 1)
	#print(n)
	#print(m)
	ind1 = sample(1:n, m, replace=FALSE)
	ind2 = setdiff(1:n, ind1)
	print(ind1)
	print(ind2)
	df1 = MergeDfList(dfList=dataList, ind=ind1, fcn=rbind)
	df2 = MergeDfList(dfList=dataList, ind=ind2, fcn=rbind)
	#print('***')
	#print(dfList)
	outList = list(df1=df1, df2=df2)
	return(outList)
}

Example = function() {
	n = 100
	x = sample(c('boz', 'asb', 'sag'), n, replace=TRUE)
	y = sample(c('yazd', 'rafsanjan', 'kerman'), n, replace=TRUE)
	z = rnorm(n)
	u = z + rnorm(n)
	df = data.frame(x, y, z, u)
	res = DivideDataWrt(df, colNames=c('x', 'y'))
	res
}

## return info about the class of the df columns
## corrects the class from numeric and integer to factor 
##if the levels are not many
CorrectClassDf = function(df, levelsCutoff=0) {
	colNames = names(df)
	n = length(colNames)
	for (i in 1:n) {
		colName = colNames[i]
		c = class(df[ , colName])[1]
		l = length(unique(df[ , colName]))
		print(paste(colName, c, sep=': '))
		print(paste('the cardinality of the levels of this variable is:', l))
		if ((c=='integer' | c=='numeric') & l<levelsCutoff) 
		{
			df[ , colName] = as.factor(df[ , colName])
		}
	}
	return(df)
}

Example = function() {
	df = trainData
	df = CorrectClassDf(df=df, levelsCutoff=3)
}

## mapping some factors of x (given in 'from') 
# to other factors (given in 'to')
FactorMap = function(x, from, to) {
	
	if (length(from) != length(to)) {
		print('Warning: from and to size do not match'); 
		return(NULL)
	}

	x = factor(x)
	levels(x) = c(levels(x), to)
	k = length(from)
	for (i in 1:k) {
		fac = from[i]
		newfac = to[i]
		ind = which(x == fac)
		if (length(ind) > 0) {
			x[ind] = newfac
		}
	}
	x=factor(as.character(x))
	levels(x) = setdiff(levels(x), from)
	return(x)
}

Example = function() {
	x = sample(c('a', 'b', 'c'), 100, replace=TRUE) 
	x[100] = 'd'       
	x = factor(x)
	
	from = c('a', 'd')
	to = c('u', 'v')
	y = FactorMap(x, from, to)
}

## builds a function which when 
NonExistFactorMap = function(facNames, toFacName) {
	Func = function(y) {
		yFacNames = levels(y)
		nonExistInd = which(!(yFacNames %in% facNames))
		from = yFacNames[nonExistInd]
		to = rep(toFacName, length(from))
		FactorMap(y, from=from, to=to)
	}
	return(Func)
}

## deprecated
NonExistFactorMap2 = function(x, facNames, toFacName) {
	xTab = table(x)
	xFacNames = names(xTab)
	nonExistInd = which(!(xFacNames %in% facNames))
	from = xFacNames[nonExistInd]
	to = rep(toFacName, length(from))
	
  Func = function(y) {
		FactorMap(y, from=from, to=to)
	}
	
  return(Func)
}

Example = function() {
   facNames = c('a', 'b', 'd')
	toFacName = 'zzz'
	y = c('a', 'b', 'g')
	y = factor(y)
	Fcn = NonExistFactorMap(facNames, toFacName)
}   
	
### find out the levels of a factor and assign 
CorrectFactorFcn = function(x, cutoff=3) {
	x = factor(x)
	#levels(x)
	#summary(x)
	tab = table(x)
	facNum = length(tab)
	facNames = names(tab)
	freqInd = which(tab == max(tab))[1]
	freqFac = facNames[freqInd]
	toFacName = freqFac
	from = NULL
	to = NULL
	smallFreqInd = which(tab < cutoff)
	from = facNames[smallFreqInd]
	to = rep(freqFac, length(from))
	
	SmallFreq = function(y) {
		FactorMap(y, from=from, to=to)        
	}
	
	x = SmallFreq(x)
	NonExist = NonExistFactorMap(
		facNames=facNames, 
		toFacName=toFacName)
	
	Combine = function(y) {
		SmallFreq(NonExist(y))
	}
	
	outList = list('x'=x, 'SmallFreq'=SmallFreq, 
		'NonExist'=NonExist, 'Combine'=Combine)
	return(outList)
}

Example = function() {
	x = sample(c('a', 'b', 'c'), 100, replace=TRUE) 
	x[100] = 'd'       
	res = CorrectFactorFcn(x)
	newX= res$x    
	SmallFreq = res[[2]]
	NonExist = res[[3]]    
	Combine = res[[4]]    
	y = c('d', 'd', 'd', 'd')
	SmallFreq(y)    
	y = c('d', 'd', 'zereshk')
	y = factor(y)
	NonExist(y)    
	Combine(y)
}

### find out the levels of a factor and assign for dataframes
### this will assign little-appearing factors to most frequent factor
### this will also assign non-existing factors to most frequent factor
CorrectFactorDf = function(df, cutoff=3) {
	
	colNames = names(df)
	n = length(colNames)
	fcnList = list()
	for (i in 1:n) {
		colName = colNames[i]
		c = class(df[ , colName])[1]
		fcnList[[i]] = (f=function(x){return(x)})
		if (c == 'factor') {
			out = CorrectFactorFcn(df[ , colName], cutoff=cutoff)
			df[ , colName] = out[['x']]
			fcnList[[i]] = out[['Combine']] 
		}
	}
	outList = list(df=df, fcnList=fcnList, Fcn=FcnListDfFcn(fcnList))
	return(outList)
}

Example = function() {
	
  x = sample(c('a', 'b', 'c'), 100, replace=TRUE) 
	x[100] = 'd'    
	y = sample(c('aa', 'bb', 'cc'), 100, replace=TRUE) 
	y[99]='dd'
	y[100] = 'dd'    
	z = 1:100    
	df = data.frame(x=x, y=y, z=z)     
	res = CorrectFactorDf(df, cutoff=3)
	df2 = res[['df']]
	fcn = res[['Fcn']]

	x = sample(c('a', 'b', 'e'), 100, replace=TRUE) 
	x[100] = 'd'
	y = sample(c('aa', 'ee', 'cc'), 100, replace=TRUE) 
	y[99]='dd'
	y[100] = 'dd'
	z = 1:100
	newDf = data.frame(x=x, y=y, z=z)
	fcn(newDf)

	fcnList = res[['fcnList']]
	FcnListDfFcn(fcnList)(newDf)
}
	
### from a factor variable build numeric columns and replace in df
FactorDfNumeric = function(df) {
	
  m = dim(df)[2]
	outDf = NULL
	ind = NULL
	count = 0
	
  for (i in 1:m) {
		x = df[ , i]
		if (class(x) == 'factor') {
			name = colnames(df)[i]
			colnames(df)[i] = paste(name, '_', sep='')
			name = colnames(df)[i]
			ind = c(ind, i)
			#print(class(x))
			string = paste('xTrans = model.matrix( ~ ', name, '-1, data=df)', sep='')
			#print(string)
			eval(parse(text=string))
			xTrans = data.frame(xTrans)
			xTrans2 = xTrans[ , -1, drop=FALSE]
			if (count == 0) {
				outDf = xTrans2
			} else {
				outDf = cbind(outDf, xTrans2)
			}
			count = count + 1
		}
	}
	df2 = df
	if (length(ind) > 0) {
		df2 = df2[ , -ind, drop=FALSE]
		df2 = cbind(df2, outDf)
	}
	return(df2)
}

Example = function() {
	df = data.frame(x=1:10, y=c(rep('a', 5), rep('b', 5)),
		z=c(rep('c', 5), rep('d', 5)))
	FactorDfNumeric(df)
}

Example = function() {
	dates = c("2011-01-01 ", "2011-01-01", "2011-01-01")
	hrs = c('0', '1', '21')
	df = data.frame(date=dates, hr=hrs)

	BuildDatetime(df=df, dateCol='date', hrCol='hr')
}

### Fourier series
Fseries = function(data, colName, omega=2*pi, ord=1) {
	
  asc = as.character
	n = dim(data)[1]
	outDf = matrix(NA, n, 2*ord)
	outDf = data.frame(outDf)
	x = data[ , colName]
	columns = list()
	ind = 0 
	
  for (k in 1:ord) {
		sincoln = paste('sin', asc(k), '_', colName, sep='')
		coscoln = paste('cos', asc(k), '_', colName, sep='')
		columns = append(columns, sincoln)
		columns = append(columns, coscoln)
		u = omega*k*x
		sin = sin(u)
		cos = cos(u)
		ind = ind + 1
		outDf[ , ind] = sin
		names(outDf)[ind] = sincoln
		ind = ind + 1
		outDf[ , ind] = cos
		names(outDf)[ind] = coscoln
	}
	return(list('df'=outDf, 'columns'=columns)) 
}    
		   
Example = function() {
	
  x = seq(0, 2, 0.01)
	data = data.frame('x'=x)
	df = Fseries(data, colName='x') 
	
	cols = df[['columns']]
	df = df[['df']]
	plot(x, df[ , 2])
}

### multiple F series
FseriesMulti = function(data, colNames, omegas=NA, ords=NA) {
	
  k = length(colNames)
	if (is.na(omegas[1])) {
		omegas = rep(2*pi, length(colNames))
	}
	if (is.na(ords[1])) {
		ords = rep(1, length(colNames))
	}
	outData = NULL
	outColumns = list()
	
	for (i in 1:k) {
		colName = colNames[i]
		omega = omegas[i]
		ord = ords[i]
		FS = Fseries(data=data, colName=colName, omega=omega, ord=ord)
		FSdf = FS[['df']]
		FScolumns = FS[['columns']]
		if (i == 1) {
			outData=FSdf
		} else {
			outData = cbind(outData, FSdf)
		}

		outColumns = append(outColumns, FScolumns)
	}
	return(list('df'=outData, 'columns'=outColumns))        
}                 

Example = function() {
  
	x = seq(0, 2, 0.01)
	y = 2*seq(0, 2, 0.01)
	z = 4*x
	data = data.frame('x'=x, 'y'=y, 'z'=z)
	colNames = colnames(data)
	
	res = FseriesMulti(data, colNames=c('x', 'y', 'z'), omegas=NA, ord=NA) 
	df = res[['df']]
	colNames = res['columns']
	 
	plot(df[ , 1] ,df[ , 2])
	plot(x, df[ , 1])
}

### create an interaction matrix using two explan matrices: explan1, explan2
### the interactions needed are taken from interactList
### every tuple in interact list starts with an index in explan1
### then it follows by indices in interact2
CreateInteractFcn = function(
    interactList=NULL,
    interactMatrix=NULL,
    sep='_') {
	
  #print(interactList)
	#print(interactMatrix)
	if (is.null(interactList)) {
		x = interactMatrix
		interactList = split(t(x), rep(1:ncol(t(x)), each=nrow(t(x))))
	}
	print(interactList)
	
	Func = function(explan1, explan2) {
		k = length(interactList)
		k1 = dim(explan1)[2]
		k2 = dim(explan2)[2]

		n1 = dim(explan1)[1]
		n2 = dim(explan2)[1]
		if (n1 != n2) {
			print('explan1 and explan2 row nums do not match')
		}
		explanInteract = NULL
		for (i in 1:k) {
			index = interactList[[i]]
			explan1_index = index[1]
			explan2_index = index[-1]
			for (j in explan2_index) {
				explanInteract = cbind(explanInteract, 
					(explan1[ , explan1_index]*explan2[ , j]))
			}
		}

		explanInteract = data.frame(explanInteract)

		count = 0
		for (i in 1:k) {
			index = interactList[[i]]
			explan1_index = index[1]
			explan2_index = index[-1]
			if (!is.null(explan2_index)) {
				for (j in explan2_index) {
					count = count + 1
					name = paste('int', explan1_index, j, sep=sep)
					name1 = names(explan1)[explan1_index]
					name2 = names(explan2)[j]
					#print(name1);print(j);print(names(explan2));print(name2)
					if (!is.null(name1) & !is.null(name2)) {
						name=paste(name, sep, name1, sep, name2, sep='')
					}
					names(explanInteract)[count] = name
				}
			}
		}
		return(explanInteract)
	}
	return(Func)
}

Example = function() {
	
  explan1 = matrix(1:9, 3, 3)
	explan2 = matrix(11:19, 3, 3)
	interactList = list(c(1, 1, 2), c(2, 1, 2))
	fcn = CreateInteractFcn(interactList)
	fcn(explan1, explan2)
	explan1 = matrix(1:9, 3, 3)
	explan2 = matrix(11:19, 3, 3)
	explan1 = data.frame(explan1)
	explan2 = data.frame(explan2)
	names(explan1) = c('a', 'b', 'c')
	names(explan2) = c('t', 'u', 'v')

	interactList = list(c(3, 1, 2, 3), c(1, 2))
	CreateInteractFcn(interactList)(explan1, explan2)
	interactMatrix = rbind(c(1, 2), c(2, 3), c(1, 1))
	CreateInteractFcn(interactMatrix=interactMatrix)(explan1, explan2)
}

### randomForest prediction 
RfFit = function(y, explan) {
	
  InstallLoad('randomForest')
	mod = randomForest(y ~ ., data=explan)
	return(mod)

}

Example = function() {
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ ,2]+rnorm(n)
	mod = RfFit(yObs, explan)
}    


## fitting via aggregate
AggFit = function(
	  y,
    explan,
    yName='value',
    wrtCols=NULL, 
	  AggFunc=mean,
    aggrFuncName='mean') {
	  explan = data.frame(explan)
	
	if (length(wrtCols) == 0) {
		wrtCols=colnames(explan)
	}
	
	data = cbind(y,explan)
	colnames(data)[1] = yName

	aggModDf = SummWrt(
		  data,
      valueCol=yName,
      wrtCols=wrtCols,
      probs=NULL, 
		  fcnList=list(AggFunc),
      fcnName='mean') 
	
  aggMod = list(df=aggModDf)
	class(aggMod) = 'aggMod'
	return(aggMod)
}

AggPred = function(aggMod, explan) {
	colNames = colnames(aggMod$df)[-length(colnames(aggMod$df))]
	yName = colnames(aggMod$df)[length(colnames(aggMod$df))]
	#print(dim(explan))
	explan[ , 'orderTemp'] = 1:dim(explan)[1]
	out = merge(explan, aggMod[['df']], by=colNames, all.x=TRUE)
	out = out[order(out[ , 'orderTemp']), ]
	yPred = out[ , yName]
	return(yPred)
}

predict.aggMod = function(aggMod, explan, ...) {
	return(AggPred(aggMod, explan))
}

print.aggMod = function(aggMod) {
	print(aggMod[['df']])
}

summary.aggMod = function(aggMod) {
	return(aggMod[['df']])
}

## remove Agg Value such as mean, median
RemoveAggValue = function(
	data,
  colName,
  wrtCols=NULL, 
	AggFunc=mean,
  aggrFuncName='mean',
	addToData=TRUE,
  newColName=NULL) {
	
	aggMod = AggFit(y=data[ , colName], explan=data, yName=colName, 
		wrtCols=wrtCols, AggFunc=AggFunc, aggrFuncName=aggrFuncName) 
	yPred = AggPred(aggMod, explan=data)
	out = data[ , colName] - yPred
	if (addToData) {
		out = cbind(data, out)
		if (is.null(newColName)) {
			newColName = paste(colName, '_', aggrFuncName, '_removed', sep='')
		}
		colnames(out)[dim(out)[2]] = newColName
	}
	return(out)
}

Example = function() {
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y = (
		0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) +
		20*(u1==5) + 1*(u2==1) + rnorm(n))
	explan = data.frame(x1=x1, x2=x2)
	data = cbind(y,explan)
	RemoveAggValue(data,colName='y', wrtCols=c('x1', 'x2'), AggFunc=mean, 
		aggrFuncName='mean', addToData=TRUE)
}

###### imputation
### impute col
ImputeCol = function(
    df,
    valueCol,
    wrtCols,
    AggFunc=NULL,
    aggrFuncName="agg") {
	
  if (is.null(AggFunc)) {
		AggFunc = function(x){mean(x, na.rm=TRUE)}
	}
	
  y = df[ , valueCol]
	
  aggMod = AggFit(
      y=y,
      explan=df,
      yName=valueCol,
		  wrtCols=wrtCols,
      AggFunc=AggFunc,
      aggrFuncName=aggrFuncName) 
	
  yPred = AggPred(aggMod, explan=df)
	
	ind = which(is.na(y))
	out = y
	out[ind] = yPred[ind]
	return(out)
}

Example = function() {
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + rnorm(n)
	y[1:5] = NA
	explan = data.frame(x1=x1, x2=x2)
	data = cbind(y, explan)
	ImputeCol(df=df, valueCol='y', wrtCols=c('x1', 'x2'), AggFunc=NULL)
}

### impute data
ImputeDf = function(
  	df,
	  valueCols,
	  wrtCols,
	  AggFunc=NULL,
	  replaceCol=TRUE) {
	
	out = NULL
	for (i in 1:length(valueCols)) {
		valueCol = valueCols[i]
		
    newCol = ImputeCol(
        df=df,
        valueCol=valueCol,
        wrtCols=wrtCols,
        AggFunc=NULL)
		
    out = cbind(out, newCol)
		colnames(out)[i] = valueCol
		if (replaceCol) {
			df[ , valueCol] = newCol
		} 
	}
	
	if (replaceCol) {
		out = df
	}

	return(out)

}

Example = function() {
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y1 = (
      0*(u1 == 1) +
      8*(u1 == 2) +
      10*(u1 == 3) +
      15*(u1 == 4) +
      20*(u1 == 5) +
      1*(u2 == 1) + rnorm(n))

	y2 = (
      0*(u1 == 1) +
      3*(u1 == 2) +
      1*(u1 == 3) +
      5*(u1 == 4) +
      25*(u1 == 5) +
      1*(u2==1) + rnorm(n))
	
  y1[1:5] = NA
	y2[5:10] = NA
	explan = data.frame(x1=x1, x2=x2)
	df = cbind(y1, y2, explan)
	ImputeDf(df=df, valueCols=c('y1', 'y2'), wrtCols=c('x1', 'x2'))
}

#setMethod(predict, aggMod, .aggMod)
Example = function() {
	n = 500
	u1 = sample(1:5, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	y = (
      0*(u1 == 1) +
      8*(u1 == 2) +
      10*(u1 == 3) +
      15*(u1 == 4) +
      20*(u1 == 5) +
      1*(u2 == 1) + rnorm(n))

	explan = data.frame(x1=x1,x2=x2)
	aggMod = AggFit(y, explan)
	yPred = AggPred(explan, aggMod)
	plot(y, yPred)
	yPred = predict(aggMod, explan)
}

Example = function() {
	n = 500
	u1 = sample(1:5, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	 y = (
      0*(u1 == 1) +
      8*(u1 == 2) +
      10*(u1 == 3) +
      15*(u1 == 4) +
      20*(u1 == 5) +
      1*(u2 == 1) + rnorm(n))

	explan = data.frame(x1=x1, x2=x2)
	aggMod = AggFit(y, explan)
	yPred = predict(aggMod, explan)
	plot(y, yPred)

	FitAssess(yObs=y, explan=explan, model='Agg', wrtCols=NULL)
}

## fit dev models
DevModelFit = function(
  	y,
	  explanBasic,
	  explanDev,
	  basicModel,
	  devModel, 
	  devFcn=NA,
	  devInvFcn=NA,
	  form='additive',
	  wrtCols=NULL,
	  family='gaussian') {
	
	if (!is.null(form) & form == 'additive') {
		devFcn = function(x, y)(x - y)
		devInvFcn = function(x, y)(x + y)
	}
	
	if (!is.na(form) & form=='multi') {
		devFcn = function(x, y)(x/y)
		devInvFcn = function(x, y)(x*y)
	}
	
	bmod = basicModel(y, explan=explanBasic)
	yFitBasic = predict(bmod, explan=explanBasic, wrtCols=wrtCols)
	e = devFcn(y, yFitBasic)
	dmod = devModel(e, explanDev)
	eFit = predict(dmod, explanDev)
	out = list(bmod=bmod, dmod=dmod, devFcn=devFcn, devInvFcn=devInvFcn)
	class(out) = 'devMod'
	return(out)
}

print.devMod = function(devMod) {
	bmod = devMod[['bmod']]
	dmod = devMod[['dmod']]
	print('basic model')
	print(bmod)
	print('dev model')
	print(dmod)
}

predict.devMod = function(devMod, explanBasic, explanDev, ...) {
	bmod = devMod[['bmod']]
	dmod = devMod[['dmod']]
	devFcn = devMod[['devFcn']]
	devInvFcn = devMod[['devInvFcn']]
	yPredBasic = predict(bmod, explanBasic)
	ePred = predict(dmod, explanDev)
	yPred = devInvFcn(yPredBasic, ePred) 
	return(yPred)
}

Example = function() {
	n = 500
	u1 = sample(1:5, n, replace=TRUE)
	x1 = factor(u1, labels = c("p1", "p2", "p3", "p4", "p5"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels = c("rich", "poor"))
	x3 = rnorm(500)
	x4 = rnorm(500)
	y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + rnorm(n)

	explanBasic = data.frame(x1=x1, x2=x2)
	explanDev = data.frame(x3, x4)

	# glm
	devMod = DevModelFit(
      y, explanBasic=explanBasic, explanDev=explanDev,
		  basicModel=AggFit, devModel=GlmFitMatr, devFcn=NA,
      devInvFcn=NA, form='additive')

	# RF
	devMod = DevModelFit(
      y, explanBasic=explanBasic, explanDev=explanDev,
		  basicModel=AggFit, devModel=RfFit, devFcn=NA,
      devInvFcn=NA, form='additive')

	yFit = predict(
      devMod, explanBasic=explanBasic, explanDev=explanDev)
	
  plot(y, yFit)
	print(devMod)
	
	gmod = GlmFitMatr(y, explan)
	yFit = predict(gmod, explan=explan)
	plot(y, yFit)
}

## Fitting various algorithms in R
FitAssess = function(
    yObs,
    explan=NULL,
    model, 
  	indTrain=NULL,
    indValid=NULL,
    obsDamping=TRUE,
  	ymin=NULL,
    ymax=NULL,
    wrtCols=NULL,
    explanBasic=NULL, 
  	explanDev=NULL,
    basicModel=NULL, 
  	devModel=NULL,
    devForm='additive',
    family='gaussian', 
  	larsType='lasso',
    divideColNames=NULL, 
  	divideProp=1/2,
    explanDivide=NULL,
  	divideRangeCols=NULL,
    divideRangeList=NULL) {
	
	n = length(yObs)
	divideOrder = 1:n
 	
	if (!is.null(divideColNames)) {
		data = cbind(divideOrder, explanDivide)
		colnames(data)[1] = 'divideOrder'
		res = DivideDataWrt(df=data, prop=divideProp,
			colNames=divideColNames)
		dataTrain = res[[1]]
		dataValid = res[[2]]
		indTrain = dataTrain[ , 'divideOrder']
		indValid = dataValid[ , 'divideOrder']
	} 
	
	if (!is.null(divideRangeCols)) {
		logicalInd = rep(1, n)
		for (i in 1:length(divideRangeCols)) {
			colName = divideRangeCols[i]
			x = explanDivide[ , colName]
			colRange = divideRangeList[[i]]
			logicalInd = (x < colRange[2] & x > colRange[1])*logicalInd   
		}
		indTrain = which(logicalInd>0)
		indValid = setdiff(1:n, indTrain)
	}
	
	
	if (is.null(indTrain)) {
		m = round(n/2)
		indTrain = sample(1:n, m)
	}
	
	if (is.null(indValid)) {
		indValid = setdiff(1:n, indTrain)
	}
	
	yTrain = yObs[indTrain]
	yValid = yObs[indValid] 
	explanTrain = explan[indTrain, , drop=FALSE]
	explanValid = explan[indValid, , drop=FALSE]
	
	yminObs = min(yTrain)
	ymaxObs = max(yTrain)
	
	if (model == 'RF') {
		InstallLoad('randomForest')
		mod = RfFit(y=yTrain, explan=explanTrain)
		yFit = predict(mod, newdata=explanTrain)
		yPred = predict(mod, newdata=explanValid)
	}
	
	if (model == 'GLM') {
		mod = GlmFitMatr(y=yTrain, explan=explanTrain, family=family)
		yFit = predict(mod, explan=explanTrain)
		yPred = predict(mod, explan=explanValid)
	}
	
	if (model == 'LARS') {
		InstallIf('lars')
		library('lars')
		xTrain  = FactorDfNumeric(explanTrain)
		xTrain = as.matrix(xTrain)
		p = dim(xTrain)[2]
		mod = lars(x=xTrain, y=t(yTrain), type=larsType)
		res = predict.lars(mod, newx=xTrain, type='fit')
		fit = res[["fit"]]
		yFit = fit[,(p+1)]
		xValid = FactorDfNumeric(explanValid)
		xValid = as.matrix(xValid)
		res = predict.lars(mod, newx=xValid)
		pred = res$fit
		yPred = pred[ , (p+1)]
	}
		
	if (model == 'PLS') {
		InstallIf('pls')
		library('pls')
		ncomp=10
		#X1 = explan_all
		xTrain = as.matrix(explan[indTrain, ])
		#PLSmod = mvr(y1~X1,  ncomp=6 , data = mydata)
		mod = plsr(yTrain ~ xTrain, ncomp=ncomp)
		fit = predict(mod, newdata=xTrain)
		yFit = fit[ , , ncomp]
		yFit = MaxComp(yFit, 0)
		xValid = explan[indValid, ]
		xValid = as.matrix(xValid)
		pred = predict(mod, newdata=xValid)
		yPred = pred[ , , ncomp]
	}
	
	
	if (model == 'Agg') {
		mod = AggFit(y=yTrain, explan=explanTrain, wrtCols=wrtCols)
		yFit = AggPred(mod, explan=explanTrain)
		yPred = AggPred(mod, explan=explanValid)
	}
	
	if (model == 'Dev') {
		if (is.null(explanBasic)) {explanBasic=explan}
		if (is.null(explanDev)) {explanDev=explan}
		explanBasicTrain = explanBasic[indTrain, ]
		explanBasicValid = explanBasic[indValid, ]
		explanDevTrain = explanDev[indTrain, ]
		explanDevValid = explanDev[indValid, ]
		mod = DevModelFit(
			  yTrain, explanBasic=explanBasicTrain, explanDev=explanDevTrain,
			  basicModel=basicModel, devModel=devModel, devFcn=NA, devInvFcn=NA,
			form=devForm, wrtCols=wrtCols)
		
    yFit = predict(mod, explanBasic=explanBasicTrain, explanDev=explanDevTrain)
		yPred = predict(mod, explanBasic=explanBasicValid, explanDev=explanDevValid)
	}
	
	summary(mod)
	
	if (obsDamping) {
		yFit = MaxComp(yFit, yminObs)
		yFit = MinComp(yFit, ymaxObs)
	}
	if (obsDamping) {
		yPred = MaxComp(yPred, yminObs)
		yPred = MinComp(yPred, ymaxObs)
	}
	
	if (!is.null(ymin)) {
		yFit = MaxComp(yFit, ymin)
		yPred = MaxComp(yPred, ymin)
	}
	if (!is.null(ymax)) {
		yFit = MinComp(yFit, ymax)
		yPred = MaxComp(yPred, ymin)
	}
	
	indOK = which(!is.na(yPred))
	num = length(yPred) - length(indOK)
	if (num > 0) {
		print('*** Warning: this number of predictions are NA:')
		print(num)
	}
	
	par(mfrow=c(1, 2))
	plot(yTrain, yFit, col=rgb(0,  0, 0, alpha=0.1), pch=20)
	print('fit cor and rmse')
	print(cor(yTrain, yFit))
	abline(0, 1, col='blue')
	rmse = sqrt(mean((yTrain-yFit)^2))
	print(rmse)
	plot(yValid,yPred,col=rgb(0, 0, 0, alpha=0.1), pch=20)
	abline(0, 1, col='blue')
	print('test cor and rmse')
	print(cor(yValid[indOK], yPred[indOK]))
	rmse = sqrt(mean((yValid[indOK] - yPred[indOK])^2))
	print(rmse)
	return(mod)
}

Example = function() {
	### TRY GLM
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
	mod = FitAssess(yObs, explan, model='GLM')
	
	### TRY LARS
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
	mod = FitAssess(yObs, explan, model='LARS')
	
	### TRY Random Forest
	n = 100
	explan = matrix(rnorm(n*10), n, 10)
	explan = data.frame(explan)
	yObs = 5*explan[ , 1] + explan[ , 2] + rnorm(n)
	mod = FitAssess(yObs, explan, model='RF')
	
   
  ### TRY Dev models
	n = 400
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1, labels=c("p1", "p2", "p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels=c("rich", "poor"))
	x3 = rnorm(n)
	x4 = rnorm(n)
	yObs = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)

	explanBasic = data.frame(x1=x1, x2=x2)
	explanDev = data.frame(x3, x4)
	explan = cbind(explanBasic, explanDev)
	
	mod = FitAssess(yObs, explanBasic=explanBasic, explanDev=explanDev,
		model='Dev', basicModel=AggFit, devModel=GlmFitMatr)
	
	mod = FitAssess(yObs, explanBasic=explanBasic, explanDev=explanDev,
		model='Dev', basicModel=AggFit, devModel=RfFit)
	
	## divide data wrt specific columns
	mod = FitAssess(yObs, explan=cbind(explanBasic, explanDev), model='GLM')
	
	
	mod = FitAssess(yObs, explan=cbind(explanDev), model='GLM',
		divideColNames=c('x1', 'x2'), divideProp=1/2, explanDivide=explanBasic)
	
	mod = FitAssess(yObs, explan=cbind(explanDev), model='GLM',
		explanDivide=explan[ , c('x3', 'x4')], divideRangeCols=c('x3', 'x4'), 
		divideRangeList=list(c(0, 1), c(0, 2)))
	
	
	dig_deeper = function() {
		basicModel = AggFit
		devModel = GlmFitMatr
		indTrain = 1:250
		indValid = 250:500
		explanBasicTrain = explanBasic[indTrain, ]
		explanBasicValid = explanBasic[indValid, ]
		explanDevTrain = explanDev[indTrain, ]
		explanDevValid = explanDev[indValid, ]
		mod = DevModelFit(
			yTrain, explanBasic=explanBasicTrain, explanDev=explanDevTrain,
			basicModel=basicModel, devModel=devModel, devFcn=NA, devInvFcn=NA,
			form=devForm,
			wrtCols=wrtCols)
		yFit = predict(mod, explanBasic=explanBasicTrain, explanDev=explanDevTrain)
		yPred = predict(mod, explanBasic=explanBasicValid, explanDev=explanDevValid)
	}
}

### categorical
FitAssessCat = function(
    y, explan, model, indTrain=NA, indValid=NA) {
	
  n = dim(explan)[1]
	if (is.na(sum(indTrain))) {
		m=round(n/2)
	}
	
	indTrain = sample(1:n, m)
	indValid = setdiff(1:n, indTrain)
	yTrain = y[indTrain]
	yValid = y[indValid] 
	explanTrain = explan[indTrain, ]
	explanValid = explan[indValid, ]
	
	if (model == 'RF') {
		mod = RfFit(y=yTrain, explan=explanTrain)
		summary(mod)
		yFit = predict(mod)
		yPred = predict(mod, newdata=explanValid)
	}
	
	
	if (model == 'nnet') {
		library('nnet')
		size = NA
			# scale inputs: divide by 50 to get 0-1 range
		mod = nnet(yTrain ~ ., data=X1, size=size) 
		 # multiply 50 to restore original scale
		fit = predict(mod)
		L = length(yTrain)
		yFit = rep(NA, L)
		NAMES = levels(yTrain)
		for (i in 1:L) {
			ind = which(fit[i, ] == max(fit[i, ]))[1]
			yFit[i] = NAMES[ind]
		}

		pred = predict(mod, newdata=xValid)
		L = length(yValid)
		yPred = rep(NA, L)
		NAMES = levels(yValid)
		for (i in 1:L) {
			ind = which(pred[i, ]==max(pred[i, ]))[1]
			yPred[i] = NAMES[ind]
		}
	}
	
	
	if (model == 'RSNNS') {
		library('RSNNS')
		mod = mlp(x=xTrain, y=yTrain)
		yFit = predict(mod)
		yPred = predict(mod, newdata=xValid)
	}
	
	if (model == 'multiLogistic') {
		mod = multinom(formula=yTrain ~ ., data=xTrain)
		yFit = predict(mod)
		yPred = predict(mod, data=xValid)
		res = categPredErr(yTrain, yFit)
	}
	
	if (model == 'SVM') {
		library('e1071')
		mod = svm(y=yTrain, x=xTrain)
		yFit = predict(mod)
		yPred = predict(mod, newdata=xValid)
	}
	
	par(mfrow=c(1, 2))
	plot(yTrain, yFit)
	plot(yValid, yPred)
	 
	res = categPredErr(yTrain, yFit)
	print('Fit errors')
	print(res)
	res = categPredErr(yValid, yPred)
	print('Pred Errors')
	return(mod)
}

### model matrix
## gets a list, for a list element which is a vector of variables build interactions
## using DesignMatFcn below should be better
ModelMatrixViaList = function(formulaList, data, dropIntercept=TRUE) {
	colsToText = function(colNames) {
		for (k in 1:length(colNames)) 
		{
			if (k == 1) {
				out = colNames[1]
			} else {
				out = paste(out, '*', colNames[k], sep='')
			}
		}
		 return(out)
	}

	formulaText = '~'
	n = length(formulaList)
	for (i in 1:n) {
		colNames = formulaList[[i]]
		print(colNames)
		Text = colsToText(colNames)
		print(Text)
		if (i == 1) {
			formulaText = paste(formulaText, Text, sep='')
		}
		if (i > 1) {
			formulaText = paste(formulaText,'+', Text, sep='')
		}
	}
	print(formulaText)
	
	runText = paste('outData = model.matrix(', formulaText, ', data=data)', sep='')
	eval(parse(text=runText))
	if (dropIntercept) {outData = outData[ , -1 , drop=FALSE]}
	return(outData)
}

Example = function() {

   ### TRY Dev models
	n = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1,labels=c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels=c("rich", "poor"))
	x3 = rnorm(n)
	x4 = rnorm(n)
	yObs = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)

	data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4)
	formulaList = list('x1', c('x1', 'x2'))
	
	formulaList = list('x1')
	res = ModelMatrixViaList(formulaList, data)

	formulaList = list('x1', 'x2')
	ModelMatrixViaList(formulaList, data)


	formulaList = list('x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x2+x1*x2')
	ModelMatrixViaList(formulaList, data)
}

Example = function() {
	### TRY Dev models
	n = 50
	x1 = rnorm(n)
	x2 = rnorm(n)
	x3 = rnorm(n)
	x4 = rnorm(n)
	
	data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4)
	formulaList = list('x1', c('x1', 'x2'))
	
	formulaList = list('x1')
	ModelMatrixViaList(formulaList, data)

	formulaList = list('x1', 'x2')
	ModelMatrixViaList(formulaList, data)


	formulaList = list('x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x1*x2')
	ModelMatrixViaList(formulaList, data)
	
	formulaList = list('x1+x2+x1*x2')
	ModelMatrixViaList(formulaList, data)
}

### build a new function which removes NA
RemoveNAfcn = function(f) {
	G = function(x, ...) {
		f(x, na.rm=TRUE, ...)
	}
	return(G)
}

Example = function() {
	f = mean
	x = c(1:10, NA)
	f(x)
	RemoveNAfcn(f)(x)
}

## written first for uber excercise
## if the model is not glm then we need to get the design matrix (numerical matrix)
DesignMatFcn = function(df, formulaString, DfTrans=function(x){x}) {
	#print('DesignMatFcn: data')
	formula = as.formula(formulaString)
	## design matrix
	#print('DesignMatFcn: data')
	#print(data[1, ])

	
	## this builds the design matrix for current data 
	## as well as future data
	## for current data we pass nothing
	Fcn = function(newdata=NULL) {
		#print('DesignMatFcn: dim newdata'); print(dim(newdata))
		newdata = DfTrans(newdata)
		df2 = rbind(df, newdata)
		# print('dim data2'); print(dim(data2))
		m = model.frame(formula=formula, data=df2)
		x2 = model.matrix(formula, m)
		x2 = matrix(x2, dim(x2)[1], dim(x2)[2])
		#print(dim(x2))
		nData = dim(df)[1]
		nNewdata = dim(newdata)[1]
		#print('nData'); print(nData); print('nNewdata'); print(nNewdata); print('sum')
		#print(nData+nNewdata); print('dim x2')
		x = x2[(nData + 1):(nData + nNewdata), , drop=FALSE]
		return(x)
	}
	return(Fcn)
}

Example = function() {

	n = 100
	m = 50
	u1 = sample(1:3, n, replace=TRUE)
	x1 = factor(u1,labels=c("p1","p2","p3"))
	u2 = sample(1:2, n, replace=TRUE)
	x2 = factor(u2, labels=c("rich", "poor"))
	x3 = rnorm(n)
	x4 = rnorm(n)
	y = 0*(u1==1) + 8*(u1==2) + 10*(u1==3) + 15*(u1==4) + 20*(u1==5) + 1*(u2==1) + 
		2*x3 + 3*x4 + 4*x3*(x4>1) + 7*(x3>2)*x4 + 5*cos(2*pi*x3) * rnorm(n)

	data = data.frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y)[1:m, ]
	newdata = data.frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y)[(m+1):n, ]



	formulaText = 'y~x1+x2'
	formula = as.formula(formulaText)
	m = model.frame(formula=formula, data=data)
}
Reza1317/funcly documentation built on Feb. 5, 2020, 4:06 a.m.