Nothing
#' Fit EMLasso for 1 lambda
#'
#' Fit EMLasso for 1 lambda
#'
#' @aliases EMLasso.1l EMLasso1l EMLasso1l-class
#' @param ds dataset (\code{\link{numdfr}} or \code{\link{data.frame}}) to fit it to
#' @param out outcome vector
#' @param lambda the single lambda
#' @param nrOfSamplesPerMDRow For rows with missing data, how many rows to sample. Defaults to 10.
#' @param rowsToUseForFit Which of the rows of dfr/out can be used for fitting the LASSO (int vector). Defaults to all rows.
#' @param firstTimeCompleter object supported by \code{\link{completeMarginal}} or of similar form
#' that will complete the dataset the first time. Defaults to \code{\link{marginalCompleter}} with default parameters.
#' @param imputeDs2FitDsProperties see \code{\link{imputeDs2FitDs}} object that will provide the conversion from imputed
#' dataset to one that is ready for fitting the predictor model
#' @param fitPredictor function that will perform the predictor model fit. Defaults to \code{\link{GLoMo}} and should be of this signature.
#' @param family see \code{\link{glmnet}}. Defaults to "binomial" (i.e. lasso penalized logistic regression).
#' @param convergenceChecker function that will check based on the coefficients over the repeats whether convergence has been achieved.
#' Defaults to \code{\link{convergenceCheckCreator}()}, so a wrapper around \code{\link{checkConvergence.glmnet}}. The function should
#' have parameters \code{coefs} and \code{verbosity=0}
#' @param postProcess function, like \code{\link{postProcessEMLasso1l}} (its default) and of that signature, to do some extra
#' work before returning the result of \code{EMLasso.1l}
#' @param verbosity The higher this value, the more levels of progress and debug
#' information is displayed (note: in R for Windows, turn off buffered output)
#' @param extraLog A function (by default one that does nothing) that will be used in
#' each iteration to perform extra logging
#' @return An object of class EMLasso1l. This is a list with the following items:
#' \itemize{
#' \item \code{lasso.fit}: glmnet object
#' \item \code{predictorModel}: final predictor fit (typically of class GLoMo)
#' \item \code{coefs}: coefs for all iterations (rows) and dummycoded columnames (columns), incl. (intercept).
#' \item \code{ds}: original dataset passed along
#' \item \code{out}: outcome variable (1 for each row in ds)
#' \item \code{lambda}: 1 lambda value
#' \item \code{nrOfSamplesPerMDRow}: how many imputations per row with missing data
#' \item \code{convergence}: return value of last call to \code{convergenceChecker}. Will typically contain
#' information like \code{minIt} and \code{maxIt}.
#' \item \code{rowsToUseForFit}: which of the rows in ds was used to fit the lasso
#' \item \code{iterCount}: how many iterations occurred before convergence / maxIt
#' \item \code{imputeDs2FitDsProperties}: as passed in, but first fed to \code{\link{imputeDs2FitDsProps}}
#' \item any other items added by \code{postProcess}
#' }
#' @author Nick Sabbe \email{nick.sabbe@@ugent.be}
#' @references [PENDING]
#' @keywords EMLasso fit
#' @examples aDfr<-generateTypicalIndependentDfr(numCat=10, numCnt=10, numObs=100, catProbs=rep(1/3,3),
#' rcnt=typicalRandomNorm, doShuffle=TRUE, verbosity=1)
#'
#' outlins<- -mean(aDfr$cnt1)+aDfr$cnt1+2*(aDfr$cat1=="b")
#' outprobs<-expit(outlins)
#' y<-factor(sapply(outprobs, function(prob){sample(c("no", "yes"), 1, prob=c(1-prob,prob))}))
#'
#' aDfr.MD<-randomNA(aDfr, 0.01, verbosity=1)
#' rlh<-findReasonableLambdaHelper(aDfr, y, verbosity=10)
#' aLam<-rlh$regionDfr[2,"lambda"]
#' emlfit1<-EMLasso.1l(aDfr.MD, y, lambda=aLam, nrOfSamplesPerMDRow=7,
#' convergenceChecker=convergenceCheckCreator(minIt=5, maxIt=10), verbosity=10)
#' @export
EMLasso.1l<-function(ds, out, lambda, nrOfSamplesPerMDRow=10,
rowsToUseForFit=seq(nrow(ds)), firstTimeCompleter=marginalCompleter(),
imputeDs2FitDsProperties=normalImputationConversion(),
fitPredictor=GLoMo, #GLoMo(dfr, weights, verbosity)
family="binomial", convergenceChecker=convergenceCheckCreator(),
postProcess=postProcessEMLasso1l, verbosity=0, extraLog=function(...){})
{
if(family=="binomial")
{
if(! is.factor(out))
{
catw("Family is binomial, so converting the outcome to factor.")
out<-factor(out)
if(length(levels(out)) != 2)
{
stop("The outcome should have exactly 2 levels")
}
}
}
else if(family=="multinomial")
{
if(! is.factor(out))
{
catw("Family is binomial, so converting the outcome to factor.")
out<-factor(out)
}
}
if(is.data.frame(ds))
{
catwif(verbosity > 0, "ds was passed in as a data.frame. For performance reasons, it will now be converted to numdfr first!")
ds<-as.numdfr(ds)
}
imputeDs2FitDsProperties<-imputeDs2FitDsProps(object=imputeDs2FitDsProperties,ds=ds,verbosity=verbosity)
numObsToUse<-length(rowsToUseForFit)
numObs<-nrow(ds)
numVars<-ncol(ds)
levelslist<-allLevels(ds)
factCols<-findCatColNums(ds)
numFacts<-length(factCols)
catwif(verbosity > 0, numObsToUse, "/", numObs, "obs,", numFacts, "factor vars,", numVars, "vars total.")
#get an initial estimate of the GLoMo parameters:
#First, 'randomly' complete the dataset:
catwif(verbosity > 0, "random fill the dataset, dims before:", dim(ds))
imputedData<-completeMarginal(object=firstTimeCompleter,ds=ds, out=out, rowsToUse=rowsToUseForFit, verbosity=verbosity-2)
# #NSTMP
# d_imputedData<<-imputedData
# #NSTMP
catwif(verbosity > 0, "random fill the dataset, dims after:", dim(imputedData$ds))
#get an initial estimate of the lasso parameters
#only the rows that correspond to rowsToUseForFit are to be used
catwif(verbosity > 0, "First Lasso fit.") #takes _a lot_ of time
lasso.fit<-fit.glmnet(ds=imputedData$ds[imputedData$completedUse,], out=subsetFirstDim(imputedData$out,imputedData$completedUse),
lambda=lambda, weights=imputedData$weights[imputedData$completedUse],
verbosity=verbosity-1, imputeDs2FitDsProperties=imputeDs2FitDsProperties, family=family)
coefs<-coef(lasso.fit)
#apparently, this holds a sparse matrix with 1 column
coefs<-as.data.frame(as.matrix(t(coefs)))
# #NSTMP
# d_lasso.fit<<-lasso.fit
# #NSTMP
catwif(verbosity > 0, "First predictor model fit.")
#actually fit the predictor model on this adjusted dataset
#(would it make sense to calculate the uniqueIdentifiersPerRow up front? Probably not?)
predictorModel<-fitPredictor(dfr=imputedData$ds, weights=imputedData$weights, verbosity=verbosity-1)
# #NSTMP
# d_predictorModel<<-predictorModel
# #NSTMP
reusableForSampling<-predictorModelSamplingReusables(predictorModel=predictorModel, iterCount=0,
previousReusables=NULL, ds=ds, verbosity=verbosity-1)
extraLog(reusableForSampling)
# #NSTMP
# d_reusableForSampling<<-reusableForSampling
# #NSTMP
#Here starts the actual EM
convergence<-list(converged=FALSE) #make this similar in form to convergenceChecker result
iterCount<-0
while(! convergence$converged)
{
iterCount<-iterCount+1
catwif(verbosity > 1, "iteration", iterCount)
catwif(verbosity > 1, "->Generating missing data")
#first, fill out the missing data
reusableForvalidation<-outcomeModelValidationReusables(outcomeModel=lasso.fit, ds=ds, out=out,
imputeDs2FitDsProperties=imputeDs2FitDsProperties, verbosity=verbosity-2)
catwif(verbosity > 1, "->Before predict call")
# #NSTMP
# d_reusableForvalidation<<-reusableForvalidation
# #NSTMP
curData<-sampleConditional(predictorModel=predictorModel, outcomeModel=lasso.fit,
nrOfSamplesPerMDRow=nrOfSamplesPerMDRow, ds=ds, out=out, rowsToUseForFit=rowsToUseForFit,
reusableForSampling=reusableForSampling,
reusableForvalidation=reusableForvalidation, verbosity=verbosity-2)
# #NSTMP
# d_curData<<-curData
# #NSTMP
extraLog(curData, "Imputed data", ds, out)
reusableForSampling<-predictorModelSamplingReusables(predictorModel=predictorModel, iterCount=iterCount,
previousReusables=reusableForSampling, ds=ds, verbosity=verbosity-1)
extraLog(reusableForSampling)
# #NSTMP
# d_reusableForSampling2<<-reusableForSampling
# #NSTMP
#Given this data, fit
#a) The lasso
catwif(verbosity > 1, "->Fitting lasso")
lasso.fit<-fit.glmnet(ds=curData$ds[curData$useForFit,], out=curData$useOut,
lambda=lambda, weights=curData$weights[curData$useForFit],
verbosity=verbosity-2, imputeDs2FitDsProperties=imputeDs2FitDsProperties, family=family)
extraLog(lasso.fit)
# #NSTMP
# d_lasso.fit2<<-lasso.fit
# #NSTMP
#b) The Predictor Model
catwif(verbosity > 1, "->Fitting predictors model")
predictorModel<-fitPredictor(dfr=curData$ds, weights=curData$weights, verbosity=verbosity-2)
extraLog(predictorModel)
# #NSTMP
# d_predictorModel2<<-predictorModel
# #NSTMP
catwif(verbosity > 1, "->Checking convergence")
newcoefs<-coef(lasso.fit)
newcoefs<-as.data.frame(as.matrix(t(newcoefs)))
coefs<-rbind(coefs, newcoefs) #may not work if some factor levels are missing...
convergence<-convergenceChecker(coefs, verbosity = verbosity-2)
#converged<-checkConvergence.lognet(coefs, minIt, maxIt, verbosity = verbosity-2)
# #NSTMP
# d_convergence<<-convergence
# #NSTMP
if((verbosity > 1) & (convergence$converged)) catt("****Convergence in iteration", iterCount)
gc()
# #NSTMP
# catw("Will act like we converged anyway, to get things over with...")
# convergence$converged<-TRUE
# #NSTMP
}
retval<-list(
lasso.fit=lasso.fit,
predictorModel=predictorModel,
coefs=coefs,
ds=ds,
out=out,
lambda=lambda,
nrOfSamplesPerMDRow=nrOfSamplesPerMDRow,
convergence=convergence,
rowsToUseForFit=rowsToUseForFit,
iterCount=iterCount,
imputeDs2FitDsProperties=imputeDs2FitDsProperties)
class(retval)<-"EMLasso1l"
catwif(verbosity > 0, "Postprocessing")
retval<-postProcess(outcomeModel=lasso.fit, retval=retval, lastData=curData,
imputeDs2FitDsProperties=imputeDs2FitDsProperties, reusableForSampling=reusableForSampling,
reusableForvalidation=reusableForvalidation, verbosity=verbosity-1)
return(retval)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.