# R/GLoMo.r


#rCatsInDfr will randomly (or, if feasible, exhaustively) draw combinations
#of categories for the categorical variables, and note their relevance for
#each of the original rows.
#assumes dfr is well-formed, i.e.: only factor and numeric columns.
#also, all factors should be at the start of the columns (this may not be
#necessary anymore)

#note: see use of old version in full.glmnet.EM.fit
#note: see similar code in firstProcessingOfDfrForInitialFit.GLoMo

#known issues:
#need to check what happens if there are no categorical columns
#if there are too many categorical columns, a subset of the possible combinations
#   is sampled, but the probabilities returned are the original ones
#a set of categorical values could end up in the data.frame twice, if it can be
#   'attained' from two original rows: note that it will then carry different
#   probabilities (!)
#is it relevant to return the probability based on the categoricals alone? (probably yes)


 

#SECTION: non-exported helper functions




# .quickNumMatrix<-function(dfr) UseMethod(".quickNumMatrix")
# #need to check whether this is still any different from as.matrix.data.frame
# #in practice
# .quickNumMatrix.data.frame<-function(dfr){
# 	retval<-matrix(unlist(dfr), ncol=ncol(dfr))
# 	dimnames(retval)<-dimnames(dfr)
# 	return(retval)
# }
# .quickNumMatrix.numdfr<-function(dfr){return(as.matrix(dfr))}
# .quickNumMatrix.default<-function(dfr){return(as.matrix(dfr))}

# .matBack2OrgClass<-function(objWithClass, mat, catCols, levelList, colnms=NULL,
# 	verbosity=0) UseMethod(".matBack2OrgClass")
# 	
# .matBack2OrgClass.data.frame<-function(objWithClass, mat, catCols, levelList,
# 	colnms=NULL, verbosity=0)
# {
# 	.mat2dfr(mat=mat, catCols=catCols, levelList=levelList, colnms=colnms,
# 		verbosity=verbosity)
# }
# 
# .matBack2OrgClass.numdfr<-function(objWithClass, mat, catCols, levelList,
# 	colnms=NULL, verbosity=0)
# {
# 	posInCatCols<-match(seq(ncol(mat)), catCols, nomatch=0)
# 	allLevels<-lapply(posInCatCols, function(ccn){
# 			if(ccn > 0) return(levelList[[ccn]]) else return(character(0))
# 		})
# 	colnames(mat)<-colnms
# 	names(allLevels)<-colnms
# 	retval<-list(mat=mat, lvls=allLevels)
# 	class(retval)<-"numdfr"
# 	return(retval)
# }


.hasNA<-function(dfr)
{
	return(any(is.na(dfr)))
}

.contDataAsMat<-function(dfr){
	catcols<-findCatColNums(dfr)
	return(as.nummatrix(dfr[,-catcols]))
}



#END OF SECTION: non-exported helper functions






#optimization: best option is to look at mat2dfr... see comments there.
#   This is only relevant for data.frame...
rCatsInDfr<-function(dfr, maxFullNACatCols=6, howManyIfTooMany=1000,
	onlyCategorical=FALSE, weightsName="weights", orgriName="orgri",
	reweightPerRow=FALSE, verbosity=0,...)
{
	catCols<-findCatColNums(dfr)
	ordCols<-rep(FALSE, ncol(dfr))
	ordCols[findOrderedColNums(dfr)]<-TRUE
	dfrl<-dfr[,catCols, drop=FALSE]
	if(onlyCategorical)
	{
		dfr<-dfrl
		catCols<-seq(length(catCols))
	}
	orgnames<-colnames(dfr)
	catwif(verbosity>0, "find marginal probabilities")
	probs<-marginalProbPerCat(dfrl)#one item for every categorical column

	toAddCols<-NULL
	if((!is.null(weightsName)) && (nchar(weightsName)>0))
	{
		toAddCols<-c(toAddCols, weightsName)
	}
	if((!is.null(orgriName)) && (nchar(orgriName)>0))
	{
		toAddCols<-c(toAddCols, orgriName)
	}

	naLevels<-allLevels(dfrl)
	for(i in seq_along(probs))
	{
		unusedLevels<-(probs[[i]] == 0)
		if(sum(unusedLevels) > 0)
		{
			catwif(verbosity > 0, "Unused levels detected in column", names(naLevels)[i], ".")
			catwif(verbosity > 0, "Unused levels are:")
			printif(verbosity > 0, naLevels[[i]][unusedLevels])
			catwif(verbosity > 0, "Better to remove these up front; will try to ignore them now.")
			naLevels[[i]]<-naLevels[[i]][!unusedLevels]
			probs[[i]]<-probs[[i]][!unusedLevels]
		}
	}
	dfr<-as.nummatrix(dfr)
	catwif(verbosity>0, "dfr is now a matrix of dimension:", dim(dfr), "and class",
		class(dfr))
	catwif(verbosity>0, "while dfrl now has class", class(dfrl), "and dimension:",
		dim(dfrl))
	#so, from here on, dfr is a matrix!!! while dfrl only holds the categorical 
	#   cols but still has the same class as dfr originally had!
	
	naLevelNums<-lapply(naLevels, function(curlvls){seq(length(curlvls))})
	catwif(verbosity>0, "start producing new rows")
	newrows<-lapply(seq(nrow(dfrl)), function(ri)
	{
		catwif(verbosity>1, "row", ri, "/", nrow(dfrl))
		currow<-dfrl[ri,,drop=TRUE]
		curnas<-which(is.na(currow))
		if(length(curnas) > maxFullNACatCols)
		{
			catwif(verbosity>1, "too many categoricals missing")
			#if these are 3-level categories, this already means >= 2187 possible
			#combinations; in that case, we draw howManyIfTooMany random ones from
			#the marginal multinomials
			stopifnot(howManyIfTooMany > 0)
			if(is.null(weightsName) || (nchar(weightsName)==0))
			{
				catwif(verbosity>1, "no need to calculate weights")
				catvals<-sapply(curnas, function(ci){
						sample.int(length(naLevels[[ci]]), howManyIfTooMany, replace=TRUE,
							prob=probs[[ci]])
					})
				if(length(curnas)==1)
				{
					catvals<-matrix(catvals, ncol=1)
				}
				else if(howManyIfTooMany==1)
				{
					catvals<-matrix(catvals, nrow=1)
				}
				catCombProbs<-rep(1, howManyIfTooMany)
			}
			else
			{
				catwif(verbosity>1, "needed to calculate weights")
				catAll<-lapply(curnas, function(ci){
						rv<-sample.int(length(naLevels[[ci]]), howManyIfTooMany,
							replace=TRUE, prob=probs[[ci]])
						relvpr<-probs[[ci]][rv]
						return(list(val=rv, pr=relvpr))
					})
				catvals<-sapply(catAll, '[[', "val")
				catMargprobs<-sapply(catAll, '[[', "pr")
				if(length(curnas)==1)
				{
					catvals<-matrix(catvals, ncol=1)
					catMargprobs<-matrix(catMargprobs, ncol=1)
				}
				else if(howManyIfTooMany==1)
				{
					catvals<-matrix(catvals, nrow=1)
					catMargprobs<-matrix(catMargprobs, nrow=1)
				}
				catCombProbs<-unlist(apply(catMargprobs, 1, prod))
				if(reweightPerRow)
				{
					catCombProbs<-catCombProbs/sum(catCombProbs)
					#make the weights sum to 1
				}
			}
		}
		else if(length(curnas) > 0)
		{
			catwif(verbosity>1, "few categoricals missing")
			catvals<-dfr2mat(expand.grid(naLevelNums[curnas]))
			if((!is.null(weightsName)) && (nchar(weightsName)>0))
			{
				catwif(verbosity>1, "needed to calculate weights")
				catMargprobs<-expand.grid(probs[curnas])
				catwif(verbosity>1, "will generate", nrow(catMargprobs), "observations")
				catCombProbs<-apply(catMargprobs, 1, prod)
			}
		}
		if(length(curnas) > 0)
		{
			retval<-dfr[rep(ri, length(catCombProbs)),, drop=FALSE]
			retval[,catCols[curnas]]<-catvals
		}
		else
		{
			catwif(verbosity>1, "no categoricals missing")
			catCombProbs<-1
			retval<-dfr[ri,, drop=FALSE]
		}

		if(((!is.null(weightsName)) && (nchar(weightsName)>0)) ||
			((!is.null(orgriName)) && (nchar(orgriName)>0)))
		{
			toAdd<-rep(ri, length(catCombProbs))
			catwif(verbosity>1, "need to add either weight or orgri")
			if((!is.null(weightsName)) && (nchar(weightsName)>0))
			{
				catwif(verbosity>1, "need to add weight")
				toAdd<-catCombProbs
				if((!is.null(orgriName)) && (nchar(orgriName)>0))
				{
					catwif(verbosity>1, "need to add orgri too")
					toAdd<-cbind(toAdd, ri)
				}
			}
			else
			{
				#now we know ((!is.null(orgriName)) && (nchar(orgriName)>0))
				catwif(verbosity>1, "need to add orgri")
				toAdd<-ri
			}
			retval<-cbind(retval, toAdd)
			colnames(retval)<-c(orgnames, toAddCols)
		}
		return(retval)
	})
	catwif(verbosity>1, "combine the rows")
	resmat<-do.call(rbind, newrows)
	
	catwif(verbosity>1, "turn resulting matrix into ", class(dfrl), " again")
	result<-matBack2OrgClass(dfrl, mat=resmat, catCols=catCols, levelList=naLevels,
		ord=c(ordCols, rep(FALSE, length(toAddCols))), colnms=c(orgnames, toAddCols), verbosity=verbosity-1)
	return(result)
}
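
#A minimal usage sketch for rCatsInDfr (not run; assumes the addendum and
#NumDfr packages, which provide findCatColNums, marginalProbPerCat, allLevels,
#matBack2OrgClass, ... used above):
if(FALSE)
{
	require(addendum)
	require(NumDfr)
	tinyDfr<-data.frame(f1=factor(c("a", NA, "b")), f2=factor(c("x", "y", NA)),
		cnt=c(1.5, 2.0, 0.5))
	#row 1 has no missing categoricals and comes back once; rows 2 and 3 are each
	#expanded to one row per possible level of their missing factor, with a
	#'weights' column holding the marginal probability of that combination and an
	#'orgri' column pointing back to the original row index
	tinyExpanded<-rCatsInDfr(tinyDfr, weightsName="weights", orgriName="orgri",
		verbosity=1)
}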

rCatsAndCntInDfr<-function(dfr, maxFullNACatCols=6, howManyIfTooMany=1000,
	weightsName="weights", orgriName="orgri", reweightPerRow=FALSE, verbosity=0,...)
{
	catCols<-findCatColNums(dfr)
	numCols<-ncol(dfr)
	contCols<-(seq(numCols))[-catCols]

	#we expect the original dfr to be smaller than the expanded result, so we
	#fill the continuous columns first (they are simply filled with the column
	#mean anyway)
	cln<-0
	for(i in contCols)
	{
		cln<-cln+1
		catwif(verbosity>0, "continuous column (", i, "):", cln, "/",
			length(contCols))
		curcol<-dfr[,i,drop=TRUE]
		wherenas<-which(is.na(curcol))
		if(length(wherenas) > 0)
		{
			colm<-mean(curcol, na.rm=TRUE)
			dfr[wherenas,i]<-colm
		}
	}

	catwif(verbosity>0, "categorical columns")
	retval<-rCatsInDfr(dfr=dfr, maxFullNACatCols=maxFullNACatCols,
		howManyIfTooMany=howManyIfTooMany, onlyCategorical=FALSE,
		weightsName=weightsName, orgriName=orgriName, reweightPerRow=reweightPerRow,
		verbosity=verbosity-1,...)
		
	return(retval)
}
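
#A minimal usage sketch for rCatsAndCntInDfr (not run): like rCatsInDfr above,
#but missing continuous values are first replaced by their column mean, so the
#expanded result contains no NAs at all.
if(FALSE)
{
	tinyDfr2<-data.frame(f1=factor(c("a", NA, "b")), cnt=c(1.5, NA, 0.5))
	tinyExpanded2<-rCatsAndCntInDfr(tinyDfr2, weightsName="weights",
		orgriName="orgri", verbosity=1)
	.hasNA(tinyExpanded2) #expected to be FALSE
}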


#Altered this from fitPredictorModel.GLoMo. Turned loop code into matrix 
#operations; known issue: what if there are no/all factor columns??
#
#IMPORTANT CHANGE: we foresee that most of the time, there will be only one
#observation per cell (i.e.: per mid).
#If this is the case, use the unpooled (co)variance matrix. This will always
#be an overestimate of the true pooled (co)variance matrix! Because this will
#only create more variance in the generated predictors, this should not affect
#the results negatively.
#A possible result is that the algorithm may converge somewhat slower (as more
#'different' data is generated). For relatively few missing values, this should
#not be an issue (especially for few missing continuous values).
#
#Because it is rather hard to check (especially for all covariances) whether
#the above is the case, we will (for now?) always use the global covmat.
# -> arranged by parameter pooledCov
#
#Food for thought: does it still make sense to use the weights then??

GLoMo<-function(dfr, weights=rep(1,dim(dfr)[1]), uniqueIdentifiersPerRow=NULL,
	separator=",", pooledCov=TRUE, verbosity=0,...)
{
	stopifnot(!.hasNA(dfr))
	numCols<-ncol(dfr)
	factorCols<-findCatColNums(dfr)
	numCat<-length(factorCols)
	stopifnot(numCols!=numCat)
	stopifnot(numCat>0)
	dfrDim<-dim(dfr)
	catwif(verbosity > 0, class(dfr), "(", dfrDim, "), nr of factorCols=",
		length(factorCols))
	nobs<-dfrDim[1]
	numCont<-dfrDim[2] - numCat

	if(is.null(uniqueIdentifiersPerRow))
	{
		catwif(verbosity > 0,
			"uniqueIdentifiersPerRow were not provided so recalculating.")
		require(addendum)
		uniqueIdentifiersPerRow<-categoricalUniqueIdentifiers(dfr,
			separator=separator, na.becomes=NA)
	}
	catwif(verbosity > 0, "uniqueIdentifiersPerRow OK")

	weights<-weights/sum(weights) #make them sum to 1

	catwif(verbosity > 0, "Unique uniqueIdentifiersPerRow.")
	uids<-uniqueCharID(uniqueIdentifiersPerRow, needSort=TRUE,
		includeOccurrence=FALSE, impNr=1)
	catwif(verbosity > 0, length(uids), "found.")
	firstOccurrenceOfEachUMidInDfr<-match(uids, uniqueIdentifiersPerRow)
	
	uniqueFactorCombinationsAndContinuousMeans<-dfr[firstOccurrenceOfEachUMidInDfr,]
	
#	factuid<-dfr[firstOccurrenceOfEachUMidInDfr, factorCols]
#holds the factor variables for each unique mid
#	colnames(factuid)<-colnames(dfr)[factorCols]
	
	#note:
	#uniqueIdentifiersPerRow holds the unique identifiers for every row in the dfr
	#uids holds the unique values!

	catwif(verbosity > 0, "Find the pihat + means")
	#2011/07/14: changed to matrix operations!
	#2011/08/10: changed to sparse matrix operations!
	rowinds<-match(uniqueIdentifiersPerRow, uids)
	colinds<-seq_along(uniqueIdentifiersPerRow)
	matw<-sparseMatrix(i=rowinds,
		j=colinds, x=weights) #note: we are certain all uniqueIdentifiersPerRow and all uids are in the data, so no need to specify dims
	contDataAsMat<-.contDataAsMat(dfr)
	pihat<-rowSums(matw)
	#2011/10/25 for the below to work, we must use _re_weighted per UID, i.e. per row!!
	urs<-unique(rowinds)
	weightsums<-sapply(urs, function(curr){sum(weights[rowinds==curr])})
	mapri2u<-match(rowinds, urs)
	rweights<-weights/weightsums[mapri2u]
	matw<-sparseMatrix(i=rowinds,
		j=colinds, x=rweights) #note: we are certain all uniqueIdentifiersPerRow and all uids are in the data, so no need to specify dims
	themeans<-as.matrix(matw %*% contDataAsMat)
	
	contCols<-seq(ncol(dfr))[-factorCols]
	contnames<-colnames(dfr)[contCols]
	uniqueFactorCombinationsAndContinuousMeans[,contCols]<-themeans

#	umeans<-cbind(pihat=pihat, themeans)
#	#umeans should now hold 1 row for every unique mid
#	#it should hold the pihats as the first col, and the means for every cont var
#	colnames(umeans)<-c("pihat", contnames)

	if(! pooledCov)
	{
		stop("GLoMo for non-pooled covariance matrix has not been implemented yet.")
	}
	catwif(verbosity > 0, "Getting centralized covariances")
	omegahat<-cov.wt(contDataAsMat, wt=weights, method="ML")$cov
	dimnames(omegahat)<-list(rowvar=contnames, colvar=contnames)
	catwif(verbosity > 0, "Got centralized covariances")

	#the conversion to data.frame is perhaps unnecessary and may be a performance
	#hog in the case of numdfr... -> check to improve on this
	#2011/08/24: changed the function so that this conversion no longer takes place.
	#now, uniqueFactorCombinationsAndContinuousMeans has the same class as dfr had.
	#note: keep in sync with combineGLoMos
	retval<-list(uid=uids, pihat=pihat, omegahat=omegahat, orgdatadim=dim(dfr),
		uniqueFactorCombinationsAndContinuousMeans=uniqueFactorCombinationsAndContinuousMeans,
		factorCols=factorCols, guidSeparator=separator, invomega=invertSymmetric(omegahat, careful=FALSE))
	class(retval)<-"GLoMo"
	return(retval)
}
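
#A minimal fit sketch (not run; assumes a dataset without missing values that
#holds at least one factor and at least one numeric column, and that the
#addendum/NumDfr helpers used above are available):
if(FALSE)
{
	require(addendum)
	require(NumDfr)
	completeDfr<-data.frame(f1=factor(c("a", "a", "b", "b")),
		x=c(1.1, 0.9, 2.2, 1.8))
	fit<-GLoMo(completeDfr, verbosity=1)
	fit$uid      #one identifier per observed combination of factor levels
	fit$pihat    #estimated probability of each such combination
	fit$omegahat #pooled covariance matrix of the continuous columns
}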

getGuidData<-function(glomo, dfr, guidPerObservation=NULL, whichHaveMissingCat,
	verbosity=0)
{
	if(is.null(guidPerObservation))
	{
		catwif(verbosity>0, "guidPerObservation was not passed along")
		require(addendum)
		guidPerObservation<-categoricalUniqueIdentifiers(dfr, 
			separator=glomo$guidSeparator, na.becomes="\\d+", verbosity=verbosity-1)
	}
	else
	{
		stopifnot(length(guidPerObservation)==nrow(dfr))
	}
	#we expect match to be a lot faster + avoid lapply
	if(missing(whichHaveMissingCat)) whichHaveMissingCat<-grepl("\\d+", guidPerObservation, fixed=TRUE)

	orgWithMissing<-lapply(guidPerObservation[whichHaveMissingCat], grep, glomo$uid)
	orgWithoutMissing<-as.list(match(guidPerObservation[! whichHaveMissingCat], glomo$uid))
	possibleGlomoGuidPerObs<-vector("list", length(guidPerObservation))
	possibleGlomoGuidPerObs[whichHaveMissingCat]<-orgWithMissing
	possibleGlomoGuidPerObs[!whichHaveMissingCat]<-orgWithoutMissing
	names(possibleGlomoGuidPerObs)<-names(guidPerObservation)

	retval<-list(guidPerObservation=guidPerObservation,
		possibleGlomoGuidPerObs=possibleGlomoGuidPerObs,
		separator=glomo$guidSeparator)
	class(retval)<-"GuidData"
	return(retval)
}
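
#A minimal sketch of what the returned GuidData holds (not run; 'fit' is a
#GLoMo as in the sketch above and 'someDfrWithNAs' an assumed dataset with the
#same columns but containing missing values):
if(FALSE)
{
	gd<-getGuidData(fit, someDfrWithNAs)
	#one identifier string per row; missing categoricals are encoded as the
	#regular expression "\\d+", so they match any level when grep-ing against
	#fit$uid
	gd$guidPerObservation[1]
	#for each row: the indices of all rows in fit$uid that are compatible with
	#its observed categorical values
	gd$possibleGlomoGuidPerObs[[1]]
}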


#is only called from within predict.GLoMo to handle the unlikely case
#of a row that has no matching row(s) in the GLoMo uids
#Note: this really only is unlikely in the setting that I plan to use it in,
#i.e.: impute in the original data from a GLoMo fit to it.
randomFillAndRepeatDataRow<-function(currow, obsneeded, levelslist, newdata)
{
	fcols<-which(sapply(levelslist, length) > 0)
	curnacols<-which(is.na(currow))
	retval<-currow[rep(1, obsneeded),]
	for(curnacol in curnacols)
	{
		if(curnacol %in% fcols)
		{
			#pick a completely random level
			retval[,curnacol]<-sample.int(length(levelslist[[curnacol]]),
				size=obsneeded, replace=TRUE)
		}
		else
		{
			#whatever: unweighted mean and sd
			coldata<-newdata[,curnacol, drop=TRUE]
			uwmean<-mean(coldata, na.rm=TRUE)
			uwsd<-sd(coldata, na.rm=TRUE)
			retval[,curnacol]<-rnorm(obsneeded, mean=uwmean, sd=uwsd)
		}
	}
	return(retval)
}

reusableDataForGLoMoSampling<-function(glomo, dfr, forrows=seq(nrow(dfr)),
	guiddata=NULL, verbosity=0)
{
	if(is.null(guiddata) | (!is(guiddata, "GuidData")))
	{
		catwif(verbosity > 0,
			"guiddata were not (completely) provided so recalculating.")
		guiddata<-getGuidData(glomo, dfr, guidPerObservation=guiddata)
#		cattif(verbosity > 10, "predict.GLoMo: guiddata str: ")
#		if(verbosity > 10) str(guiddata)
	}
	catwif(verbosity > 0, "getting reusable data.")
	catcols<-glomo$factorCols
	numCat<-length(catcols)
	cntcols<-seq(ncol(dfr))[-catcols]
	numCont<-length(cntcols)
	
	perrow<-lapply(seq_along(forrows), function(i){
			currowi<-forrows[i]
			#note: improvement possible: if no missing values in a row then these
			#calculations are not needed, just return an empty list
			if((length(forrows) > 1) & (verbosity > 1))
			{
				catw("working on row", match(currowi, forrows), "/", length(forrows))
			}
			currow<-dfr[currowi,]
			glomorowsforcurrow<-guiddata$possibleGlomoGuidPerObs[[currowi]]
			whichCntColNotNA<-which(!is.na(currow[, cntcols, drop=TRUE]))
			whichCntColNA<-setdiff(seq_len(numCont), whichCntColNotNA)
			presentCntColsInDfr<-cntcols[whichCntColNotNA]
			missingCntColsInDfr<-cntcols[whichCntColNA]
			if(verbosity > 5)
			{
				catw("currowi:", currowi)
				catw("glomorowsforcurrow:", glomorowsforcurrow)
				catw("whichCntColNotNA:", whichCntColNotNA)
				catw("whichCntColNA:", whichCntColNA)
				catw("presentCntColsInDfr:", presentCntColsInDfr)
				catw("missingCntColsInDfr:", missingCntColsInDfr)
			}
			if(length(whichCntColNotNA) > 0)
			{
				omega<-glomo$omegahat[whichCntColNotNA,whichCntColNotNA, drop=FALSE]

				#note in the 1,2 notation, 1 refers to missing data (NA), while 2 refers
				#to present data (notNA)
				if((length(whichCntColNotNA) == numCont) & (exists("invomega", glomo)))
				{
					invSig22<-glomo$invomega
					catwif(verbosity > 1, "Using glomo$invomega of dimension", dim(invSig22))
				}
				else
				{
					invSig22<-invertSymmetric(omega, careful=FALSE)
					catwif(verbosity > 1, "Inverted omega of dimension", dim(invSig22))
				}
				
				presentXs<-matrix(unlist(currow[, presentCntColsInDfr, drop=TRUE]), nrow=1)
				if(length(whichCntColNA) != numCont)
				{
					sig11<-glomo$omegahat[whichCntColNA,whichCntColNA, drop=FALSE]
					sig12<-glomo$omegahat[whichCntColNA,whichCntColNotNA, drop=FALSE]
					a<-currow[,presentCntColsInDfr, drop=TRUE]
					useSigma<-sig11 - sig12 %*% invSig22 %*% t(sig12)
					sigLeft<-sig12 %*% invSig22
				}
				else
				{
					a<-NULL
					useSigma<-glomo$omegahat
					sigLeft<-NULL
				}
			}
			else
			{
				#all continuous values are missing for this row: no conditioning on
				#them is possible, so sample from the cell's unconditional normal
				a<-NULL
				useSigma<-glomo$omegahat
				sigLeft<-NULL
			}

			if(length(glomorowsforcurrow) > 1)
			{
				if(length(whichCntColNotNA) == 0)
				{
					#if all continuous values are missing, no need for conditional on them
					deltas<-glomo$pihat[glomorowsforcurrow]
				}
				else
				{
					deltas<-sapply(glomorowsforcurrow, function(curcatrow){
							pi.c<-glomo$pihat[curcatrow]
							relvMus<-matrix(unlist(glomo$uniqueFactorCombinationsAndContinuousMeans[
								curcatrow,presentCntColsInDfr,drop=TRUE]), ncol=1)
							#relvMus contains for each relevant cell and for each continuous column with missing data
							# the mean value.
							#Note: typically, in the first iteration of an EMLasso, these will all be equal, as the 
							#missing values are imputed with the marginal mean
							
# 							if(verbosity > 5)
# 							{
# 								catw("dimension and class of invSig22:", dim(invSig22), "->", class(invSig22))
# 								catw("dimension and class of relvMus:", dim(relvMus), "->", class(relvMus))
# 								catw("dimension and class of presentXs:", dim(presentXs), "->", class(presentXs))
# # 								.debug.tmp$invSig22<<-invSig22
# # 								.debug.tmp$relvMus<<-relvMus
# # 								.debug.tmp$presentXs<<-presentXs
# 							}
							partr<-invSig22 %*% relvMus         #notation refers to near page
							part1<-presentXs %*% partr        #349 in Analysis of Incomplete
							part2<- -1/2*t(relvMus) %*% partr   #Multivariate Data
							delta.c<-part1+part2+log(pi.c)
							return(delta.c)
						})
					#apparently, occasionally these deltas are _really_ big, which
					#prevents the exp from working.
					#solution: subtract a common term from all of them (this amounts to
					#   dividing them by a common factor after exponentiation)
					maxd<-max(deltas)
					if(maxd > 100)
					{
						subtr<-maxd-100
						deltas<-deltas - subtr
					}
					#Note: if all relvMus were equal, then so would be all part1 and part2,
					#so every individual exp(delta) is really a common constant times pi.c
					deltas<-exp(deltas) #p349 in Analysis of Incomplete Multivariate Data
				}
				deltasum<-sum(deltas)
				probs<-deltas/deltasum
				#Note if all relvMus were equal, then the common factor is divided away
				#so probs are simply rescaled versions of pi.c (the cell probabilities)
				#so that they sum to 1
			}
			else
			{
				probs<-NULL
			}

			retval<-list(
				a=a,                   #notation refers to near page 349 in
				useSigma=useSigma,     #Analysis of Incomplete Multivariate Data
				sigLeft=sigLeft,       #
				probs=probs,
				whichCntColNotNA=whichCntColNotNA,
				whichCntColNA=whichCntColNA,
				presentCntColsInDfr=presentCntColsInDfr,
				missingCntColsInDfr=missingCntColsInDfr
			)
			class(retval)<-"ReusableDataForGLoMoSamplingForOneRow"
			return(retval)
		})
	retval<-list(guiddata=guiddata, forrows=forrows, perrow=perrow)
	class(retval)<-"ReusableDataForGLoMoSampling"
	return(retval)
}
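
#For reference, the conditional quantities computed above (notation close to
#p. 349 of Schafer's "Analysis of Incomplete Multivariate Data"): splitting the
#continuous columns of a row into a missing part (1) and an observed part (2),
#  mu_{1|2}    = mu_1 + Sigma_12 %*% solve(Sigma_22) %*% (a - mu_2)
#  Sigma_{1|2} = Sigma_11 - Sigma_12 %*% solve(Sigma_22) %*% t(Sigma_12)
#where a holds the observed continuous values of the row. sigLeft caches
#Sigma_12 %*% solve(Sigma_22) and useSigma caches Sigma_{1|2}, so only the
#cell-specific mean mu_{1|2} needs to be recomputed in predict.GLoMo.
#The cell probabilities conditional on the observed data are proportional to
#  exp(a' solve(Sigma_22) mu_2^(c) - 1/2 * mu_2^(c)' solve(Sigma_22) mu_2^(c)) * pihat_c
#which is what the deltas above compute (up to the common term that is
#subtracted before exp to avoid overflow).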

#If newdata is NULL, nobs is the number of observations to predict (generate);
#if not, it holds the number of observations to generate per original
#observation that has missing data.
predict.GLoMo<-function(object, nobs=1, newdata=NULL, forrows=seq(nrow(newdata)),
	reusabledata=NULL, returnRepeats=FALSE, returnSelectedGlomoRows=FALSE,
	verbosity=0,...)
{
	#NOTE: the combinations of categorical values in GLoMo are always unique!
	glomo<-object #to make it more recognizable in the following code
	if(is.null(newdata) & (!is.null(reusabledata)))
	{
		warning("Don't provide reusabledata if newdata is NULL. It will be ignored.")
		reusabledata<-NULL #just in case
	}
	cntcols<-seq(ncol(glomo$uniqueFactorCombinationsAndContinuousMeans))[-glomo$factorCols]
	#(taken from the glomo itself so that this also works when newdata is NULL)
	if(is.null(newdata))
	{
		catwif(verbosity > 1, "_fully_ predicting dataset.")
		probs<-glomo$pihat
		howofteniseachglomorowsampled<-as.vector(rmultinom(1, nobs, prob=probs))
		glomorowsforcurrow<-which(howofteniseachglomorowsampled > 0)
		howofteniseachglomorowsampled<-howofteniseachglomorowsampled[glomorowsforcurrow]
		glomorowschosen<-rep(glomorowsforcurrow, howofteniseachglomorowsampled)
		retval<-glomo$uniqueFactorCombinationsAndContinuousMeans[glomorowschosen,]
		firstposofeachglomorowinresult<-cumsum(c(1, howofteniseachglomorowsampled))[
			-(length(howofteniseachglomorowsampled)+1)]
		lastposofeachglomorowinresult<-cumsum(howofteniseachglomorowsampled)
		for( i in seq_along(glomorowsforcurrow))
		{
			catwif(verbosity > 2,
				"working on glomorow", i, "/", length(glomorowsforcurrow))
			curglomorowi<-glomorowsforcurrow[i]
			howmanysamplesforcurglomorow<-howofteniseachglomorowsampled[i]
			useMu<-unlist(glomo$uniqueFactorCombinationsAndContinuousMeans[
				curglomorowi,cntcols, drop=TRUE])
			gen<-qrmvnorm(howmanysamplesforcurglomorow, mean=useMu,sigma=glomo$omegahat)
			allposinresforcurglomorow<-seq(from=firstposofeachglomorowinresult[i],
				to=lastposofeachglomorowinresult[i])
			retval[allposinresforcurglomorow, cntcols]<-gen
		}
		#note: in this case, returnRepeats has no meaningful interpretation
		if(returnSelectedGlomoRows)
		{
			return(list(predicted=retval, glomorowsused=glomorowschosen))
		}
		else
		{
			return(retval)
		}
	}
	#now we also check that the class of the newdata matches the glomo!!
	#if it doesn't, transform it.
	if(any(class(newdata) != class(glomo$uniqueFactorCombinationsAndContinuousMeans)))
	{
		warning("Unmatching classes between glomo and dfr. Will try to coerce dfr.")
		if(inherits(glomo$uniqueFactorCombinationsAndContinuousMeans, "data.frame"))
		{
			catwif(verbosity > 1, "coercing newdata to data.frame.")
			newdata<-as.data.frame(newdata)
		}
		else if(inherits(glomo$uniqueFactorCombinationsAndContinuousMeans, "numdfr"))
		{
			catwif(verbosity > 1, "coercing newdata to numdfr.")
			newdata<-numdfr(newdata)
		}
	}
	if(length(nobs) != length(forrows))
	{
		if(length(nobs) != 1) stop("Unsupported nobs passed along.")
		catwif(verbosity > 0, "readjusting nobs.")
		nobs<-rep(nobs, length(forrows))
	}
	if(is.null(reusabledata) | (!is(reusabledata, "ReusableDataForGLoMoSampling")))
	{
		catwif(verbosity > 0,
			"reusabledata were not (completely) provided so recalculating.")
		reusabledata<-reusableDataForGLoMoSampling(glomo=glomo, dfr=newdata,
			forrows=forrows, guiddata=reusabledata, verbosity=verbosity-1)
	}
	levelslist<-allLevels(newdata)
	predPerRow<-lapply(forrows, function(currowi){
			howManiethRow<-match(currowi, forrows)
			if((length(forrows) > 1) & (verbosity > 1))
			{
				catw("working on row", howManiethRow, "/", length(forrows))
			}
			currow<-newdata[currowi, ]
			indexInReusableData<-match(currowi, reusabledata$forrows)
			curreusabledata<-reusabledata$perrow[[indexInReusableData]]
			curguiddata<-reusabledata$guiddata$possibleGlomoGuidPerObs[[
				indexInReusableData]]
			if(sum(is.na(currow)) == 0)
			{
				catwif(verbosity > 1,
					"no data was missing in the original row (", currowi, ").")
				retval<-newdata[currowi, ]
				if(returnSelectedGlomoRows)
				{
					catwif(verbosity > 5, "will use as glomorowsused:", curguiddata)
					return(list(predicted=retval, glomorowsused=curguiddata))
					#note, in this case, curreusabledata should hold only one value!
				}
				else
				{
					return(retval)
				}
			}
			#which of the rows in the GLoMo (pihat/means) can be chosen for this
			#observation is stored in glomorowsforcurrow
			glomorowsforcurrow<-reusabledata$guiddata$possibleGlomoGuidPerObs[[currowi]]
			howmanysamplesforcurrow<-nobs[howManiethRow]
			if(length(glomorowsforcurrow) == 0)
			{
				#in fact this should never happen
				catwif(verbosity > 1, "row has no matching glomorows.")
				warning(paste("predict.GLoMo: Row passed along for which there are no valid predictions. There is no matching combination of categories in the GLoMo object. The rownumber was", currowi, ". Will simply pick random values."))
				retval<-randomFillAndRepeatDataRow(currow=currow, 
					obsneeded=howmanysamplesforcurrow, levelslist=levelslist,
					newdata=newdata)
				if(returnSelectedGlomoRows)
				{
					catwif(verbosity > 5, "will use as glomorowsused:", character(0))
					return(list(predicted=retval, glomorowsused=character(0)))
				}
				else
				{
					return(retval)
				}
			}
			else if(length(glomorowsforcurrow) == 1)
			{
				catwif(verbosity > 1, "row has 1 matching glomorow.")
				howofteniseachglomorowsampled<-howmanysamplesforcurrow
			}
			else
			{
				catwif(verbosity > 1, "row has multiple matching glomorow.")
				probs<-curreusabledata$probs #these probabilities are conditional on the
					#data that is present in currow (categorical AND continuous)
				catwif(verbosity > 5, "their probabilities are: ", probs)
				catwif(verbosity > 5,
					"and we need: ", howmanysamplesforcurrow, "samples.")
				howofteniseachglomorowsampled<-as.vector(rmultinom(1,
					howmanysamplesforcurrow, prob=probs))
			}
			#by now, howofteniseachglomorowsampled holds how many times each of the 
			#rows indicated by glomorowsforcurrow are selected
			#we remove unselected rows from both:
			glomorowsforcurrow<-glomorowsforcurrow[howofteniseachglomorowsampled>0]
			howofteniseachglomorowsampled<-howofteniseachglomorowsampled[
				howofteniseachglomorowsampled>0]
			#now we get a vector of all the (possibly repeated) row nrs in pimeanhat
			glomorowschosen<-rep(glomorowsforcurrow, howofteniseachglomorowsampled)
#			cattif(verbosity > 5, "	predict.GLoMo: glomorowschosen:", glomorowschosen)
			#And a starting point for the return values
			retval<-glomo$uniqueFactorCombinationsAndContinuousMeans[glomorowschosen,]
#			cattif(verbosity > 5, "	predict.GLoMo: retval structure for now:")
#			if(verbosity > 5) str(retval)
			#if no continuous data missing, we always use the values in original row:
			cntcols<-seq(ncol(newdata))[-glomo$factorCols]
#			cattif(verbosity > 5, "	predict.GLoMo: cont col indexes in dfr:", cntcols)
			if(sum(is.na(currow[,cntcols, drop=TRUE])) == 0)
			{
				catwif(verbosity > 1, "no missing continuous data!")
				orgcont<-currow[rep(1, howmanysamplesforcurrow),cntcols]
#				if(verbosity > 5)
#				{
#					catt("orgcont str:")
#					str(orgcont)
#					catt("retval str:")
#					str(retval)
#					catt("cntcols: ", cntcols)
#				}
				retval[,cntcols]<-orgcont #replace all continuous values with original
#				catwif(verbosity > 1, "passed no missing continuous data!")
			}
			else
			{
				#We also need to sample continuous values (conditional on the
				#categoricals) if we get here.
				#First, fill out the values that will not change (i.e. that were present
				#in the original row)
				retval[,curreusabledata$presentCntColsInDfr]<-currow[
					rep(1, howmanysamplesforcurrow),curreusabledata$presentCntColsInDfr]
				firstposofeachglomorowinresult<-cumsum(c(1, howofteniseachglomorowsampled))[
					seq_along(howofteniseachglomorowsampled)]
				lastposofeachglomorowinresult<-cumsum(howofteniseachglomorowsampled)
				for( i in seq_along(glomorowsforcurrow))
				{
					catwif(verbosity > 2, "working on glomorow", i, "/",
						length(glomorowsforcurrow))
					curglomorowi<-glomorowsforcurrow[i]
					howmanysamplesforcurglomorow<-howofteniseachglomorowsampled[i]
					catwif(verbosity > 5, "Need", howmanysamplesforcurglomorow, "samples")
					useMu<-unlist(glomo$uniqueFactorCombinationsAndContinuousMeans[
						curglomorowi,cntcols, drop=TRUE])
# 					#TMPNS
# 					catw("Original useMu:")
# 					print(useMu)
# 					#TMPNS
					if(length(curreusabledata$whichCntColNA) != length(cntcols))
					{
						#so part of the continuous data is already present
						#	(and reused in each sample)
						mu1<-useMu[curreusabledata$whichCntColNA]
						mu2<-useMu[curreusabledata$whichCntColNotNA]
						useMu<-unlist(mu1 + as.vector(curreusabledata$sigLeft %*%
							matrix(unlist(curreusabledata$a - mu2), ncol=1)))
# 						#TMPNS
# 						catw("sigleft:")
# 						print(curreusabledata$sigLeft)
# 						catw("a-mu2:")
# 						print(matrix(unlist(curreusabledata$a - mu2), ncol=1))
# 						#TMPNS
						#note: a is the value in the row, mu2 is the mean from the GLoMo. It is common in the 
						#first iteration of an EMLasso that these are the same (as missing values are imputed
						#with the mean).
# 						#TMPNS
# 						catw("Adapted useMu (to already present continuous data):")
# 						print(useMu)
# 						#TMPNS
					}
					catwif(verbosity > 2,
						"Past parameter calculation. Will now generate conditional normal.")
					gen<-qrmvnorm(howmanysamplesforcurglomorow, mean=useMu,
						sigma=curreusabledata$useSigma) #normal, each row = one simulation
# 					#TMPNS
# 					catw("Generated normal data")
# 					print(gen)
# 					#TMPNS
					allposinresforcurglomorow<-seq(from=firstposofeachglomorowinresult[i],
						to=lastposofeachglomorowinresult[i])
					retval[allposinresforcurglomorow,
						curreusabledata$missingCntColsInDfr]<-gen
				}
			}
			if(returnSelectedGlomoRows)
			{
				catwif(verbosity > 5, "will use as glomorowsused:", glomorowschosen)
				return(list(predicted=retval, glomorowsused=glomorowschosen))
			}
			else
			{
				return(retval)
			}
		})
	#debug.predPerRow<<-predPerRow
	#now recombine the 'subdatasets' in predPerRow, just like it's done at
	#the end of rCatsInDfr
	if(returnSelectedGlomoRows)
	{
		result<-combineSimilarDfrList(lapply(predPerRow, "[[", "predicted"))
		#this should be OK for numdfr, but probably slow for data.frame
		glomorowsused<-do.call(c, lapply(predPerRow, "[[", "glomorowsused"))
	}
	else
	{
		result<-combineSimilarDfrList(predPerRow)
		#this should be OK for numdfr, but probably slow for data.frame
	}
	if(!is.null(rownames(result)))
	{
		rownames(result)<-postfixToMakeUnique(rownames(result))
	}
	if(returnRepeats)
	{
		numRepPerRow<-sapply(seq_along(forrows), function(curi){
				currow<-newdata[forrows[curi], ]
				if(sum(is.na(currow)) == 0) return(1) else return(nobs[curi])
			})
		names(numRepPerRow)<-as.character(forrows)
		retval<-list(predicted=result, numRepPerRow=numRepPerRow)
		if(returnSelectedGlomoRows) retval$glomorowsused<-glomorowsused
		return(retval)
	}
	if(returnSelectedGlomoRows)
	{
		return(list(predicted=result, glomorowsused=glomorowsused))
	}
	return(result)
}
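
#A minimal prediction sketch (not run; 'fit' and 'someDfrWithNAs' are assumed
#to exist as in the earlier sketches):
if(FALSE)
{
	#draw 5 completely new observations from the fitted model
	newObs<-predict(fit, nobs=5)
	#imputation use: generate 3 completions per row of someDfrWithNAs; rows
	#without missing values are returned once, unchanged
	imputed<-predict(fit, nobs=3, newdata=someDfrWithNAs, returnRepeats=TRUE,
		verbosity=1)
	imputed$numRepPerRow #how many rows of imputed$predicted stem from each row
}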

#will have to see whether this really provides a speedup
#maybe doing the grep again with the reduced uids will be faster than this
updateGuidData<-function(oldglomo, newglomo, oldrowsused=seq_along(oldglomo$uid),
	oldguiddata)
{
	oldrowsused<-unique(oldrowsused)
	oldtonew<-sapply(oldglomo$uid[oldrowsused], function(curolduid){
			match(curolduid, newglomo$uid)
		})

	oldguiddata$possibleGlomoGuidPerObs<-lapply(oldguiddata$possibleGlomoGuidPerObs,
		function(oldrows){
			oldtonew[match(oldrows, oldrowsused)]
		})
	return(oldguiddata)
}

validateFunction.acceptall<-function(attempts, otherData, forrow, verbosity=0)
{
	return(seq(nrow(attempts$predicted)))
}

validateFunction.useprob<-function(attempts, otherData, forrow, verbosity=0)
{
	theprob<-otherData[forrow]
	if(is.null(theprob) || (theprob < 0) || (theprob > 1))
	{
		catwif(verbosity > 0, "invalid probability for row", forrow, ". Using 50%.")
		theprob<-0.5
	}
	res<-sample.int(2, size=nrow(attempts$predicted), replace=TRUE,
		prob=c(1-theprob, theprob))
	return(which(res==2))
}

validateFunction.default<-function(attempts, otherData, forrow, verbosity=0)
{
	if(is.null(otherData)) otherData<-rep(0.5, max(forrow))
	return(validateFunction.useprob(attempts, otherData, forrow,
		verbosity=verbosity))
}

#validateFunction, like the examples above, is a function that must return the
#		indices (rownumbers) of rows that are accepted
#->only supported for 1 row at a time
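
#A minimal sketch of a custom validate function (not run; it accepts only the
#attempted predictions whose value in a hypothetical continuous column "x" is
#positive):
if(FALSE)
{
	validateFunction.positiveX<-function(attempts, otherData, forrow, verbosity=0)
	{
		xvals<-unlist(attempts$predicted[, "x", drop=TRUE])
		return(which(xvals > 0))
	}
}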


predict.conditional<-function(object, nobs=1, dfr, forrows, 
	validateFunction=validateFunction.default, guiddata=NULL,
	otherData=NULL, initialSuccessRateGuess=0.5, verbosity=0,
	minimumSuccessRate=0.001,...) UseMethod("predict.conditional")

predict.conditional.GLoMo<-function(object, nobs=1, dfr, forrows, 
	validateFunction=validateFunction.default, guiddata=NULL,
	otherData=NULL, initialSuccessRateGuess=0.5, verbosity=0,
	minimumSuccessRate=0.001,...)
{
	if(length(forrows)!=1) stop("In the GLoMo version of predict.conditional, only one row should be passed!")
	forrow<-forrows #This method used to have a parameter forrow -> makes it easier like this
	glomo<-object #to make it more recognizable in the following code
	catwif(verbosity > 0, "for row", forrow)
	if(! .hasNA(dfr[forrow,]))
	{
		catwif(verbosity > 0, "no NAs found")
		if(is.null(guiddata))
		{
			guiddata<-getGuidData(glomo, dfr[forrow,], guidPerObservation=NULL)
			guidToFind<-guiddata$guidPerObservation[1]
		}
		else
		{
			guidToFind<-guiddata$guidPerObservation[forrow]
		}
		return(list(predicted=dfr[forrow, ], glomorowsused=match(guidToFind,
			glomo$uid)))
	}
	successes<-0
	attempts<-0
	successRateSoFar<-initialSuccessRateGuess
	tryAtATime<-as.integer((nobs-successes) / successRateSoFar)
	if(tryAtATime < (nobs-successes)) tryAtATime<-(nobs-successes)
	if(tryAtATime < 1) tryAtATime<-1
	catwif(verbosity > 0, "get reusable Data")
	reusabledata<-reusableDataForGLoMoSampling(glomo=glomo, dfr=dfr,
		forrows=forrow, guiddata=guiddata, verbosity=verbosity-1)
	acceptedRows<-NULL
	acceptedGLoMoRowsRows<-NULL
	howManyLoops<-0
	lastLoop<-FALSE
	while(successes < nobs)
	{
		howManyLoops<-howManyLoops+1
		catwif(verbosity > 1, "start of loop", howManyLoops)
		catwif(verbosity > 5, "will try", tryAtATime, "unconditional predictions")
		catwif(verbosity > 5,
			"successes:", successes, "/", attempts, "->", successRateSoFar)
		newAttempts<-predict(glomo, nobs=tryAtATime, newdata=dfr, forrows=forrow,
			reusabledata=reusabledata, returnRepeats=FALSE, 
			returnSelectedGlomoRows=TRUE, verbosity=verbosity-1)
		catwif(verbosity > 1, "will validate results now", howManyLoops)
		newAttemptValidity<-validateFunction(newAttempts, otherData, forrow,
			verbosity=verbosity-1)
		newlyAccepted<-length(newAttemptValidity)
		if(lastLoop && (newlyAccepted < (nobs - successes)))
		{
			#we had sworn that this would be the last loop, so 'accept' randomly
			#what is still needed.
			catwif(verbosity > 0, "Emergency break: randomly accepting some observations because convergence did not occur yet", howManyLoops)
			stillOpen<-seq(tryAtATime)
			if(newlyAccepted > 0) stillOpen<-stillOpen[-newAttemptValidity]
			fakeAccepted<-sample(stillOpen, nobs - successes - newlyAccepted)
			newAttemptValidity<-sort(c(newAttemptValidity, fakeAccepted))
			newlyAccepted<-nobs - successes
		}
		if(newlyAccepted > 0)
		{
			if(newlyAccepted > nobs - successes)
			{
				#note: there used to be seq instead of sample.int here, but the attempts 
				#are grouped per glomorow, so in some cases this is a bad idea...
				newAttemptValidity<-newAttemptValidity[sample.int(newlyAccepted, nobs - successes, replace=FALSE) ]
				newlyAccepted<-nobs - successes
			}
			successes<-successes+newlyAccepted
			if(is.null(acceptedRows))
			{
				acceptedRows<-newAttempts$predicted[newAttemptValidity,]
				acceptedGLoMoRowsRows<-newAttempts$glomorowsused[newAttemptValidity]
			}
			else
			{
				acceptedRows<-rbind(acceptedRows, newAttempts$predicted[
					newAttemptValidity,])
				acceptedGLoMoRowsRows<-c(acceptedGLoMoRowsRows,
					newAttempts$glomorowsused[newAttemptValidity])
			}
		}
		attempts<-attempts+tryAtATime
		if(newlyAccepted == 0)
		{
			successRateSoFar<-successRateSoFar/2
		}
		else
		{
			successRateSoFar<-successes/attempts
		}
		if((newlyAccepted == 0) & (successRateSoFar < minimumSuccessRate))
		{
			#there have been too many failures already, so ensure not too many get
			#generated + make sure the rest is accepted anyhow in the next loop.
			successRateSoFar<-minimumSuccessRate
			lastLoop<-TRUE
		}

		tryAtATime<-as.integer((nobs-successes) / successRateSoFar)
		#cattif(verbosity > 5, "Finally: tryAtATime", tryAtATime)
		if(tryAtATime < (nobs-successes)) tryAtATime<-(nobs-successes)
		if(tryAtATime < 1) tryAtATime<-1
	}
	return(list(predicted=acceptedRows, glomorowsused=acceptedGLoMoRowsRows))
}
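
#A minimal sketch of conditional (accept/reject) prediction for one row
#(not run; 'fit' and 'someDfrWithNAs' as in the earlier sketches):
if(FALSE)
{
	onerow<-predict.conditional.GLoMo(fit, nobs=10, dfr=someDfrWithNAs,
		forrows=1, validateFunction=validateFunction.useprob,
		otherData=rep(0.5, nrow(someDfrWithNAs)), verbosity=1)
	nrow(onerow$predicted) #10 accepted completions of row 1
}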

predict.conditional.allrows.GLoMo<-function(object, nobs=1, dfr, 
	forrows=seq(nrow(dfr)), validateFunction=validateFunction.default, 
	guiddata=NULL, otherData=NULL, initialSuccessRateGuess=0.5, verbosity=0,
	minimumSuccessRate=0.001, ...)
{
	glomo<-object #to make it more recognizable in the following code
	if(length(nobs) != length(forrows))
	{
		if(length(nobs) != 1) stop("Unsupported nobs passed along.")
		catwif(verbosity > 0, "readjusting nobs.")
		nobs<-rep(nobs, length(forrows))
	}
	if(is.null(guiddata) | (!is(guiddata, "GuidData")))
	{
		catwif(verbosity > 0,
			"guiddata were not (completely) provided so recalculating.")
		guiddata<-getGuidData(glomo, dfr, guidPerObservation=guiddata)
	}
	#note: overhead in this lapply call takes 2 seconds!!
	#that means: time spent outside of predict.conditional.GLoMo !!
	predPerRow<-lapply(seq_along(forrows), function(currowi){
			predict.conditional.GLoMo(object=glomo, nobs=nobs[currowi],
				dfr=dfr, forrows=forrows[currowi], validateFunction=validateFunction, 
				guiddata=guiddata,otherData=otherData,
				initialSuccessRateGuess=initialSuccessRateGuess, verbosity=verbosity-1,
				minimumSuccessRate=minimumSuccessRate)
		})
	result<-combineSimilarDfrList(lapply(predPerRow, "[[", "predicted"))
	#this should be OK for numdfr, but probably slow for data.frame
	repsPerRow<-sapply(predPerRow, function(resCurRow){nrow(resCurRow$predicted)})
	glomorowsused<-do.call(c, lapply(predPerRow, "[[", "glomorowsused"))
	return(list(predicted=result, glomorowsused=glomorowsused, repsperrow=repsPerRow))
}

#idea to make numdfr even more efficient, especially in terms of memory:
#remember which 'original' row each row refers to, and keep only the 'replacement'
#values !!! This way, when a row is repeated 20 times and only one missing value
#was there, there only needs to be one copy of the repeated values, and 20 
#different 'replacement' values!!! Need to check this!
#However: this may render the name 'numdfr' somewhat unsatisfying
#Note: this was implemented to a certain extent in numdfr.rep


#note: quite a few assumptions are made on the uniformity of the GLoMos passed in!!
combineGLoMos<-function(..., listOfGLoMos=NULL, verbosity=0)
{
	allGLoMos<-c(list(...), listOfGLoMos)
	N<-length(allGLoMos)
	catwif(verbosity > 0, "length of allGLoMos: ", N)
	numUidsPerGLoMo<-sapply(allGLoMos, function(curGLoMo){length(curGLoMo$uid)})
	catwif(verbosity > 0, "numUidsPerGLoMo: ", numUidsPerGLoMo)
	probFactorPerGLoMo<-ifelse(numUidsPerGLoMo==0, 1, 1/numUidsPerGLoMo)
	
	catwif(verbosity > 0, "allUids")
	allUids<-do.call(c, lapply(allGLoMos, "[[", "uid"))
	catwif(verbosity > 0, length(allUids), "Uids found.")
	uniqueUids<-sort(unique(allUids))
	catwif(verbosity > 0, length(uniqueUids), "unique Uids found.")
	catwif(verbosity > 0, "matchPerUniqueUid")
	matchPerUniqueUid<-sapply(allGLoMos, function(curGLoMo){match(uniqueUids,
		curGLoMo$uid)})
	#matchPerUniqueUid now holds a column per GLoMo and a row per uniqueUid.
	#if the uniqueUid does not occur in that GLoMo, it holds NA, otherwise it
	#holds the rowindex of that uniqueUid in that GLoMos uid
	catwif(verbosity > 0, "newPihat")
	newPihat<-apply(matchPerUniqueUid, 1, function(curMatchRow){
			nonNas<-!is.na(curMatchRow)
			pis<-mapply(function(ri, glomo, prf){glomo$pihat[ri]*prf},
				curMatchRow[nonNas], allGLoMos[nonNas], probFactorPerGLoMo[nonNas])
			curpi<-sum(unlist(pis))
			return(curpi)
		})
	#very _temporary_ solution: simply average out the covariance matrices!!
	#based on the concept of pooled variance (?):
	#http://en.wikipedia.org/wiki/Pooled_variance
	#may need to somehow incorporate sample size (or can I rely on nearly equal
	#sample sizes for now?)
	catwif(verbosity > 0, "combinedOmegaHat")
	omegaHatList<-lapply(allGLoMos, "[[", "omegahat")
	orgDim<-dim(allGLoMos[[1]]$omegahat)
	combinedOmegaHat<-array(do.call(c, omegaHatList), dim=c(orgDim, N))
	newOmegahat<-rowMeans(combinedOmegaHat, dims=length(orgDim))
	#another dubious one: need to think this over! -> probably not really an issue
	catwif(verbosity > 0, "allorgdims")
	allorgdims<-sapply(allGLoMos, "[[", "orgdatadim")
	newOrgdims<-rowMeans(allorgdims)
	#disregarding sample size again below
	colcount<-ncol(allGLoMos[[1]]$uniqueFactorCombinationsAndContinuousMeans)
	newFactorCols<-allGLoMos[[1]]$factorCols
	cntcls<-seq(colcount)
	if(length(newFactorCols) > 0) cntcls<-cntcls[-newFactorCols]
	catwif(verbosity > 0, "newUniqueFactorCombinationsAndContinuousMeans")
	newUniqueFactorCombinationsAndContinuousMeans<-apply(matchPerUniqueUid, 1,
		function(curMatchRow){
			nonNas<-!is.na(curMatchRow)
			usedRowList<-mapply(
				function(ri, glomo){
					rv<-glomo$uniqueFactorCombinationsAndContinuousMeans[ri,,drop=FALSE]
					return(rv)
					},
				curMatchRow[nonNas], allGLoMos[nonNas], SIMPLIFY=FALSE)
			retrow<-usedRowList[[1]]
			cntmat<-sapply(usedRowList,
				function(curnumdfr){return(unlist(curnumdfr[1, cntcls, drop=TRUE]))})
			#cntmat now holds a matrix with 1 row per cnt var and one col per 'used row'
			if(is.null(dim(cntmat)))
			{
				#This is what happens if there was only one continuous variable
				retrow[1,cntcls]<-mean(cntmat)
			}
			else
			{
				retrow[1,cntcls]<-rowMeans(cntmat)
			}
			return(retrow)
		})
	catwif(verbosity > 0, "newUniqueFactorCombinationsAndContinuousMeans combine")
	newUniqueFactorCombinationsAndContinuousMeans<-combineSimilarDfrList(
		newUniqueFactorCombinationsAndContinuousMeans)
	newGuidSeparator<-allGLoMos[[1]]$guidSeparator
	
	retval<-list(uid=uniqueUids, pihat=newPihat, omegahat=newOmegahat, 
		orgdatadim=newOrgdims,
		uniqueFactorCombinationsAndContinuousMeans=newUniqueFactorCombinationsAndContinuousMeans,
		factorCols=newFactorCols, guidSeparator=newGuidSeparator, invomega=invertSymmetric(newOmegahat, careful=FALSE))
	class(retval)<-"GLoMo"
	return(retval)
}
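
#A minimal sketch of combining GLoMos (not run; 'fit1' and 'fit2' are assumed
#to be GLoMo objects fitted on datasets with identical columns):
if(FALSE)
{
	combined<-combineGLoMos(fit1, fit2, verbosity=1)
	#or, starting from a list of fits:
	combined<-combineGLoMos(listOfGLoMos=list(fit1, fit2), verbosity=1)
	length(combined$uid) #number of unique factor combinations over both fits
}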

if(FALSE)
{
	require(addendum)
	require(NumDfr)
	setwd("C:/users/nisabbe/Documents/My Dropbox/Doctoraat/Bayesian Lasso")
	load("C:\\Users\\nisabbe\\Documents\\My Dropbox\\Doctoraat\\Bayesian Lasso\\GLoMo.RData")
	source("GLoMo.R")
	save.image("C:\\Users\\nisabbe\\Documents\\My Dropbox\\Doctoraat\\Bayesian Lasso\\GLoMo.RData")
}















#SECTION: testing code
if(FALSE)
{
	require(addendum)
	aDfr<-generateTypicalIndependentDfr(100,100,150,catProbs=randomProbabilities,
		minn=2, maxn=4)
	aDfr.MD<-randomNA(aDfr, 0.05)

	aNDfr.MD<-numdfr(aDfr.MD)

	catw("Conversion 1 way time used: ", ttxt(system.time(tst<-numdfr(aDfr.MD))))
	#Conversion one way time used:  user: 0.05, system: 0.00, elapsed: 0.04
	#(hardly worth the mention!)

	catw("Full time used: ", ttxt(system.time(aNDfr.RF<-rCatsInDfr(aNDfr.MD,
		verbosity=0))))
	#Current implementation:
	#Full time used:  user: 2.58, system: 0.03, elapsed: 2.62
	#(a lot better than it was!!)

	catw("Full time used: ", ttxt(system.time(aDfr.RF<-rCatsInDfr(aDfr.MD,
		verbosity=0))))
	#This is my best implementation so far based on data.frame:
	#Full time used:  user: 22.22, system:  2.64, elapsed: 24.98




	require(addendum)
	require(NumDfr)

	aDfr.RFC<-rCatsAndCntInDfr(aDfr.MD, verbosity=1)
	colnames(aDfr.RFC)
	.hasNA(aDfr.RFC[,seq(ncol(aDfr.RFC)-2)])
	aNDfr.RFC<-rCatsAndCntInDfr(aNDfr.MD, verbosity=1)
	colnames(aNDfr.RFC)
	.hasNA(aNDfr.RFC[,seq(ncol(aNDfr.RFC)-2)])



	aGLoMo.RF<-GLoMo(aDfr.RFC[,seq(ncol(aDfr.RFC)-2)],
		weights=aDfr.RFC[,"weights", drop=TRUE], verbosity=10)
	aNDfr.RFC2<-numdfr(aDfr.RFC)
	aGLoMo.RF.N2<-GLoMo(aNDfr.RFC2[,seq(ncol(aNDfr.RFC2)-2), returnAsMatrix = FALSE],
		weights=aNDfr.RFC2[,"weights", drop=TRUE], verbosity=10)

	aGLoMo.RF.N<-GLoMo(aNDfr.RFC[,seq(ncol(aNDfr.RFC)-2), returnAsMatrix = FALSE],
		weights=aNDfr.RFC[,"weights", drop=TRUE], verbosity=10)



	aNDfr.MD.guids<-getGuidData(glomo=aGLoMo.RF.N, dfr=aNDfr.MD,
		guidPerObservation=NULL)
	a.nsamplesperrow<-sample.int(10, size=nrow(aNDfr.MD), replace=TRUE)
	aNDfr.MD.pred<-predict(aGLoMo.RF.N, nobs=a.nsamplesperrow, newdata=aNDfr.MD,
		reusabledata=aNDfr.MD.guids, returnRepeats=TRUE,
		returnSelectedGlomoRows=TRUE, verbosity=10)



	aNDfr.MD.predcond<-predict.conditional.allrows.GLoMo(glomo=aGLoMo.RF.N,
		nobs=a.nsamplesperrow, dfr=aNDfr.MD, forrows=seq(nrow(aNDfr.MD)),
		validateFunction=validateFunction.useprob, guiddata=aNDfr.MD.guids,
		otherData=rep(0.5, nrow(aNDfr.MD)), initialSuccessRateGuess=0.7, verbosity=20)
	str(aNDfr.MD.predcond)



	tmpeml<-EMLasso.1l.lognet.cv_parallel_1
	listOfGLoMos<-lapply(tmpeml$actualfits,
		function(curfit){curfit$fitinfo$glomo})

	class(listOfGLoMos[[1]])
	str(listOfGLoMos[[1]])

	tstglmo<-do.call(combineGLoMos, c(listOfGLoMos, verbosity=6))

}



