inst/deprecated/utilities.r

## Utilities
## Written by Jared P. Lander
## See LISCENSE for copyright information


## @modelFactorVars: (character vector) names of variables that are factors
## @modelModel: (model.matrix) the model.matrix from the model, I would like this to be changed to accept a smaller set of the model.matrix so there is less data being passed arounf
## @modelCoefs: (character vector) the coefficient names that will be matched and shortened
## @shorten:  (logical or character vector) if true all variables will be shortened, if false, none will be (to save computation), if a character vector, only the ones listed will be shortened, the other will remain (this may get VERY complicated
## @factors: (character vector) a list of vectors to work with if we are only interested in a few
## @only: (logical) if factors restricts what we are looking at then decide if we want just that variable or the stuff it interacts with too
## have to finish dealing with only showing some factors while also shortening some, all or none
##      should be done! Yay!
buildFactorDF <- function(modelFactorVars, modelModel, modelCoefs, shorten=TRUE, factors=NULL, only=NULL)
{
    # if we are only looking for some factors, just check those saving time on the rest
    # needs to be changed to work with exclude
#      if(!is.null(factors))
#      {
#          modelFactorVars <- factors
#      }
    
    # build a data.frame that matches each factor variable with it's levels
    varDFTemp <- adply(modelFactorVars, 1, function(x, modelD) { expand.grid(x, extractLevels(x, modelD), stringsAsFactors=FALSE) }, modelModel)	## Build a frame of the variables and the coefficient names for the factor variables
	names(varDFTemp)[2:3] <- c("Var", "Pivot")		## give good names to the frame

    # match each level to every coefficient (factor or numeric)
	varDF <- expand.grid(varDFTemp$Pivot, modelCoefs, stringsAsFactors=FALSE)
	names(varDF)[1:2] <- c("Pivot", "Coef")		## give good names to the frame

    # join the two data.frames so we have variable name, levels name and coefficient names
    varDF <- join(varDF, varDFTemp, by="Pivot")

    rm(varDFTemp); # housekeeping
	
    ## create columns to hold altered versions of the variable and pivot, it replaces special characters with their excaped versions
    varDF$PivotAlter <- varDF$Pivot
	varDF$VarAlter <- varDF$Var
	
    ## the special characters and their escaped equivalents
	specials <- c("!", "(", ")", "-", "=", ".")
	
    ## go through and do the replacing
	alterList <- subSpecials(varDF$VarAlter, varDF$PivotAlter, specialChars=specials)
    
    ## put the subbed values back
    varDF$VarAlter <- alterList[[1]]
    varDF$PivotAlter <- alterList[[2]]
    
    rm(alterList); # housekeeping
    

    # set up a column to keep track of which combinations are good
	varDF$Valid<- NA
	
    # the short names of the coefficients
	varDF$CoefShort <- NA
    
    # see if the coefficient is equivalent to the variable
    varDF <- ddply(.data=varDF, .variables="PivotAlter", .fun=function(DF) { DF$Valid=regexpr(unique(DF$PivotAlter), DF$Coef, ignore.case=FALSE); return(DF) })

    # just take the ones that match
    varDF <- varDF[varDF$Valid > 0, ]
    
    varDF$VarCheck <- varDF$VarAlter
    
    ## if a list of variables to shorten was given, then make sure that the other variable names won't be subbed out
    if(identical(class(shorten), "character"))      # maybe this should be class(shorten) == "character" instead to make things safer?
    {
        # make the ones that aren't listed "" so that they won't be subbed out
        varDF[!varDF$Var %in% shorten, "VarAlter"] <- ""
    }
    
    ## group the variable names to sub out with a "|" between each so any will be subbed out
    ## this now creates two variables like that
    ## Subbers is used for the coefficient shortening
    ## Checkers is used for narrowing down the the variables
    varDF <- ddply(varDF, .(Coef), function(vect, namer, checker, collapse, keepers) { vect$Subbers <- paste(vect[, c(namer)], collapse=collapse); vect$Checkers <- paste(unique(vect[, c(checker)]), collapse=collapse); return(vect[1, keepers]) }, namer="VarAlter", checker="VarCheck", collapse="|", keepers=c("Var", "Coef", "Subbers", "Checkers", "CoefShort"))
    #varDF <- ddply(varDF, .(Coef), function(vect, namer, checker, collapse, keepers) { vect$Subbers <- paste(vect[, c(namer)], collapse=collapse); vect$Checkers <- paste(vect[, c(checker)], collapse=collapse); return(vect[1, keepers]) }, namer="VarAlter", checker="VarCheck", collapse="|", keepers=c("Var", "Coef", "Subbers", "Checkers", "CoefShort"))

    ## if only certain factors are to be shown, narrow down the list to them
    if(!is.null(factors))
    {
    	theCheckers <- strsplit(x=varDF$Checkers, split="|", fixed=TRUE)
	 
    	## if they only want that variable and not it's interactions
		if(identical(only, TRUE))
		{
			varDF <- varDF[varDF$Checkers %in% factors, ]
		}else
		{
			## if any of the variables are in the factors to keep then keep it
            ######################
            ## need to adjust it so the user can specify just the interaction of factors, and not the factors themselves
			theKeepers <- laply(theCheckers, function(x, toCheck) { any(x %in% toCheck) }, toCheck=factors)
			varDF <- varDF[theKeepers, ]
			rm(theKeepers);
		}
		
		rm(theCheckers);
    }
    
    # if we are not supposed to shorten the coefficients at all (shorten==FALSE) then just swap Coef into CoefShort
    # this can be done so that the caller function just grabs Coef instead of CoefShort
    # would be nice if this can be done higher up so not as much processing needs to be done
    if(identical(shorten, FALSE))
    {
        varDF$CoefShort <- varDF$Coef
        return(varDF[, c("Var", "Checkers", "Coef", "CoefShort")])
    }
    
    # now sub out the subbers from the coef to make coef short
    varDF <- ddply(varDF, .(Subbers), function(DF) { DF$CoefShort <- gsub(paste("(^|:)", "(", unique(DF$Subbers), ")", sep=""), "\\1", DF$Coef); return(DF) } )

    # return the results
	return(varDF[, c("Var", "Checkers", "Coef", "CoefShort")])
}


## get the levels of factor variables
## @varName:  the variable to pay attention to
## @modelModel:  the model of interest
extractLevels <- function(varName, modelModel)
{
    # put the variable name in front of the level
    paste(varName, levels(factor(modelModel[[varName]])), sep="")
}

# have to get the levels from the coef names
# factors and only not needed because that is already taken care of previously
## ALMOST IDENTICAL to buildFactorDF except in the way it does expand.grid
##          would be awesome to combine the two
rxVarMatcher <- function(modelFactorVars, modelCoefNames, modelCoefs, shorten=TRUE, factors=NULL, only=NULL)
{
    ## match a factor var to those that are just that var or interaction with a numeric
    ## do it for each factor var
    # then will find combinations
    # put each factor var with each coefficient
    varDF <- expand.grid(modelFactorVars, modelCoefNames, stringsAsFactors=FALSE)
    names(varDF) <- c("Var", "Coef")
    
    ## check if the variable matches the coef it is paired with
    # create columns to hold altered versions of the variable and pivot, it replaces special characters with their excaped versions
    varDF$VarAlter <- varDF$Var
	
    ## the special characters and their escaped equivalents
	specials <- c("!", "(", ")", "-", "=", ".")
	
    # go through and do the replacing
	alterList <- subSpecials(varDF$VarAlter, specialChars=specials)
    
    # put the subbed values back
    varDF$VarAlter <- alterList[[1]]
    varDF$VarCheck <- varDF$VarAlter
    varDF$CoefShort <- NA
    
    rm(alterList);   # housekeeping

    # now check VarAlter against coef
    varDF <- ddply(varDF, .variables="Var", .fun=function(DF) { DF$Valid <- regexpr(pattern=paste("(^| for |, )(", unique(DF$VarAlter), ")=", sep=""), text=DF$Coef); return(DF) })

    # only keep the valid ones
    varDF <- varDF[varDF$Valid != -1, ]
    
    ## if a list of variables to shorten was given, then make sure that the other variable names won't be subbed out
    if(identical(class(shorten), "character"))      # maybe this should be class(shorten) == "character" instead to make things safer?
    {
        # make the ones that aren't listed "" so that they won't be subbed out
        varDF[!varDF$Var %in% shorten, "VarAlter"] <- ""
    }
    
    ## group the variable names to sub out with a "|" between each so any will be subbed out
    ## this now creates two variables like that
    ## Subbers is used for the coefficient shortening
    ## Checkers is used for narrowing down the the variables
    varDF <- ddply(varDF, .(Coef), function(vect, namer, checker, collapse, keepers) { vect$Subbers <- paste(vect[, c(namer)], collapse=collapse); vect$Checkers <- paste(vect[, c(checker)], collapse=collapse); return(vect[1, keepers]) }, namer="VarAlter", checker="VarCheck", collapse="|", keepers=c("Var", "Coef", "Subbers", "Checkers", "CoefShort"))
    
    # if we are not supposed to shorten the coefficients at all (shorten==FALSE) then just swap Coef into CoefShort
    # this can be done so that the caller function just grabs Coef instead of CoefShort
    # would be nice if this can be done higher up so not as much processing needs to be done
    if(identical(shorten, FALSE))
    {
        varDF$CoefShort <- varDF$Coef
        return(varDF[, c("Var", "Checkers", "Coef", "CoefShort")])
    }

    # do the shortening
    varDF <- ddply(varDF, .(Subbers), function(DF) { DF$CoefShort <- gsub(paste("(^|, | for )", "(", unique(DF$Subbers), ")=", sep=""), "\\1", DF$Coef); return(DF) } )

    # return the results
    return(varDF[, c("Var", "Checkers", "Coef", "CoefShort")])
    
    #return(varDF)
}

#' Build data.frame for plotting
#'
#' Builds a data.frame that is appropriate for plotting coefficients
#'
#' This is the workhorse for coefplot, it get's the data all prepared
#'
#' \code{factors} Vector of factor variables that will be the only ones shown
#'
#' \code{only} logical; If factors has a value this determines how interactions are treated.  True means just that variable will be shown and not its interactions.  False means interactions will be included.
#'
#' \code{shorten} logical or character; If \code{FALSE} then coefficients for factor levels will include their variable name.  If \code{TRUE} coefficients for factor levels will be stripped of their variable names.  If a character vector of variables only coefficients for factor levels associated with those variables will the variable names stripped.
#'
#' @aliases buildModelCI
#' @author Jared P. Lander www.jaredlander.com
#' @param model The fitted model to build information on
#' @param innerCI How wide the inner confidence interval should be, normally 1 standard deviation.  If 0, then there will be no inner confidence interval.
#' @param outerCI How wide the outer confidence interval should be, normally 2 standard deviations.  If 0, then there will be no outer confidence interval.
#' @param sort Determines the sort order of the coefficients.  Possible values are c("natural", "normal", "magnitude", "size", "alphabetical")
#' @param decreasing logical; Whether the coefficients should be ascending or descending
#' @param numeric logical; If true and factors has exactly one value, then it is displayed in a horizontal graph with continuous confidence bounds.
#' @param intercept logical; Whether the Intercept coefficient should be plotted
#' @param \dots See Details for information on \code{factors}, \code{only} and \code{shorten}
#' @param name A name for the model, if NULL the call will be used
## @param multi logical, If \code{TRUE} a column is added denoting which model the modelCI is for
## @param factors Vector of factor variables that will be the only ones shown
## @param only logical; If factors has a value this determines how interactions are treated.  True means just that variable will be shown and not its interactions.  False means interactions will be included.
## @param shorten logical or character; If \code{FALSE} then coefficients for factor levels will include their variable name.  If \code{TRUE} coefficients for factor levels will be stripped of their variable names.  If a character vector of variables only coefficients for factor levels associated with those variables will the variable names stripped.
#' @return Otherwise a \code{\link{data.frame}} listing coeffcients and confidence bands is returned.
#' @seealso \code{\link{coefplot}} \code{\link{multiplot}}
#' @examples
#'
#' data(diamonds)
#' model1 <- lm(price ~ carat + cut, data=diamonds)
#' model2 <- lm(price ~ carat, data=diamonds)
#' model3 <- lm(price ~ carat + cut + color, data=diamonds)
#' coefplot:::buildModelCI(model1)
#' #coefplot(model1)
#' #coefplot(model2)
#' #coefplot(model3)
#' #coefplot(model3, factors="cut")
#' #coefplot(model3, factors="cut", numeric=T)
#' #coefplot(model3, shorten="cut")
#'
buildModelCI <- function(model, outerCI=2, innerCI=1, intercept=TRUE, numeric=FALSE, sort=c("natural", "normal", "magnitude", "size", "alphabetical"), decreasing=TRUE, name=NULL, ...)
{
    # get variables that have multiple options
    sort <- match.arg(sort)
    
    # get the information on the model
    modelInfo <- getModelInfo(model, ...)
    
	# get the coef and SE from modelInfo
	modelCoef <- modelInfo$coef             # the coefficients
	modelSE <- modelInfo$SE                 # the standard errors
 	modelMatched <- modelInfo$matchedVars   # the data.frame matching coefficients to variables

	# all the info about the coefficients
	modelCI <- data.frame(LowOuter=modelCoef - outerCI*modelSE, HighOuter=modelCoef + outerCI*modelSE, LowInner=modelCoef - innerCI*modelSE, HighInner=modelCoef + innerCI*modelSE, Coef=modelCoef) # build a data.frame of the confidence bounds and original coefficients
    names(modelCI) <- c("LowOuter", "HighOuter", "LowInner", "HighInner", "Coef")

	modelCI$Name <- rownames(modelCI)	## grab the coefficient names into the data.frame
 
	## join the factor coefficient info to the data.frame holding the coefficient info
	modelMatcher <- modelMatched[, c("Checkers", "Coef", "CoefShort")]
	names(modelMatcher)[2] <- "Name"
	modelMatcher$Name <- as.character(modelMatcher$Name)
	modelCI <- join(modelCI, modelMatcher, by="Name")
	
	rm(modelMatcher);		# housekeeping
	
	# since we will be using coef short for the coefficient labels the numeric variables need to be given CoefShort elements which will be taken from the Name column
	modelCI$CoefShort <- ifelse(is.na(modelCI$CoefShort), modelCI$Name, modelCI$CoefShort)
	
	# Similar for the Checkers column

	modelCI$Checkers <- ifelse(is.na(modelCI$Checkers), "Numeric", modelCI$Checkers)

	## if the intercept is not to be shown, then remove it
	if(intercept == FALSE | numeric)		## remove the intercept if so desired
	{
		theIntercept <- which(modelCI$Name == "(Intercept)")	# find the variable that is the intercept
		# make sure the intercept is actually present, if so, remove it
		if(length(theIntercept) > 0)
		{
			# remove the intercept
			modelCI <- modelCI[-theIntercept, ]
		}
  		rm(theIntercept);		# housekeeping
	}
	
	# if there are no good coefficients, then stop
	if(nrow(modelCI) == 0)
	{
		stop("There are no valid coeficients to plot", call.=FALSE)
	}
  
	## possible orderings of the coefficients
	ordering <- switch(sort,
							natural=order(1:nrow(modelCI), decreasing=decreasing), 	# the way the data came in
							normal=order(1:nrow(modelCI), decreasing=decreasing),	# the way the data came in
							nat=order(1:nrow(modelCI), decreasing=decreasing), 			# the way the data came in
							magnitude=order(modelCI$Coef, decreasing=decreasing), 		#  size order
							mag=order(modelCI$Coef, decreasing=decreasing), 			# size order
							size=order(modelCI$Coef, decreasing=decreasing),			# size order
							alphabetical=order(modelCI$Name, decreasing=decreasing), 	# alphabetical order
							alpha=order(modelCI$Name, decreasing=decreasing),			# alphabetical order
							order(1:nrow(modelCI))		# default, the way it came in
					)
	
	# implement the ordering
	modelCI <- modelCI[ordering, ]
	#return(modelCI)
	#return(modelCI$Name)
	modelCI$CoefShort <- factor(modelCI$CoefShort, levels=modelCI$CoefShort)
    
    # if a name for the model is provided, use it, otherwise use the call
    if(is.null(name))
    {
        modelCI$Model <- as.character(paste(model$call, collapse="_"))
    }else
    {
        modelCI$Model <- name
    }
    
    # convert the pipe in Checkers to a * for better display
    modelCI$Checkers <- gsub("\\|", ":", modelCI$Checkers)
    
    # return the data.frame
    return(modelCI)
}



#buildPlotting <-function()

#' Coefplot plotting
#'
#' Build ggplot object for coefplot
#'
#' This function builds up the ggplot layer by layer for \code{\link{coefplot.lm}}
#'
#' @author Jared P. Lander www.jaredlander.com
#' @seealso \code{\link{coefplot.lm}} \code{\link{coefplot}} \code{\link{multiplot}}
#' @aliases buildPlotting.lm
#' @param modelCI An object created by \code{\link{buildModelCI}}
#' @param modelMeltInner The inner SE part of the object built by \code{\link{meltModelCI}}
#' @param modelMeltOuter The outer SE part of the object built by \code{\link{meltModelCI}}
#' @param title The name of the plot, if NULL then no name is given
#' @param xlab The x label
#' @param ylab The y label
#' @param innerCI How wide the inner confidence interval should be, normally 1 standard deviation.  If 0, then there will be no inner confidence interval.
#' @param outerCI How wide the outer confidence interval should be, normally 2 standard deviations.  If 0, then there will be no outer confidence interval.
#' @param multi logical; If this is for \code{\link{multiplot}} then dodge the geoms
#' @param lwdInner The thickness of the inner confidence interval
#' @param lwdOuter The thickness of the outer confidence interval
#' @param color The color of the points and lines
#' @param cex The text size multiplier, currently not used
#' @param textAngle The angle for the coefficient labels, 0 is horizontal
#' @param numberAngle The angle for the value labels, 0 is horizontal
#' @param zeroColor The color of the line indicating 0
#' @param zeroLWD The thickness of the 0 line
#' @param zeroType The type of 0 line, 0 will mean no line
#' @param facet logical; If the coefficients should be faceted by the variables, numeric coefficients (including the intercept) will be one facet
#' @param scales The way the axes should be treated in a faceted plot.  Can be c("fixed", "free", "free_x", "free_y")
#' @param numeric logical; If true and factors has exactly one value, then it is displayed in a horizontal graph with continuous confidence bounds.
#' @param fillColor The color of the confidence bounds for a numeric factor
#' @param alpha The transparency level of the numeric factor's confidence bound
#' @param horizontal logical; If the plot should be displayed horizontally
#' @return a ggplot graph object
#' @examples
#'
#' data(diamonds)
#' model1 <- lm(price ~ carat + cut, data=diamonds)
#' theCI <- coefplot:::buildModelCI(model1)
#' theCIMelt <- coefplot:::meltModelCI(theCI)
#' coefplot:::buildPlotting.lm(theCI, theCIMelt$modelMeltInner, theCIMelt$modelMeltInner)
#'
buildPlotting.lm <- function(modelCI, 
                        modelMeltInner=NULL, modelMeltOuter=NULL, 
                        title="Coefficient Plot", xlab="Value", ylab="Coefficient",
                        lwdInner=1, lwdOuter=0, color="blue",
    					cex=.8, textAngle=0, numberAngle=0, outerCI=2, innerCI=1, multi=FALSE,
						zeroColor="grey", zeroLWD=1, zeroType=2, numeric=FALSE, fillColor="grey", alpha=1/2,
    					horizontal=FALSE, facet=FALSE, scales="free")
{
    ## build the layer infos
    # outerCI layer
    # first is for a normal coefplot or a faceted multiplot
    # the second is for a single-pane multiplot

	outerCIGeom <- list(DisplayOne=geom_line(aes(y=CoefShort, x=value, group=CoefShort), data=modelMeltOuter, colour=color, lwd=lwdOuter),
                        DisplayMany=geom_linerange(aes(ymin=LowOuter, ymax=HighOuter, colour=as.factor(Model)), data=modelCI, lwd=lwdOuter, position=position_dodge(width=1)),
                        None=NULL)
	# innerCI layer
    # first is for a normal coefplot or a faceted multiplot
    # the second is for a single-pane multiplot
	innerCIGeom <- list(DisplayOne=geom_line(aes(y=CoefShort, x=value, group=CoefShort), data=modelMeltInner, colour=color, lwd=lwdInner),
                        DisplayMany=geom_linerange(aes(ymin=LowInner, ymax=HighInner, colour=as.factor(Model)), data=modelCI, lwd=lwdInner, position=position_dodge(width=1)),
                        None=NULL)
	# ribbon layer
	ribbonGeom <- list(None=NULL, geom_ribbon(aes(ymin=LowOuter, ymax=HighOuter, group=Checkers), data=modelCI, fill=fillColor, alpha=alpha, lwd=lwdOuter))

    # point layer
    # first is for a normal coefplot or a faceted multiplot
    # the second is for a single-pane multiplot
    pointGeom <- list(DisplayOne=geom_point(colour=color),
                      DisplayMany=geom_point(position=position_dodge(width=1), aes(ymax=Coef, colour=as.factor(Model))),
                      None=NULL)
    
	# faceting info
	faceting <- list(None=NULL, Display=facet_wrap(~Checkers, scales=scales))
    
    if(numeric)
	{
        # numeric (sideways) plot
		p <- ggplot(data=modelCI, aes(y=Coef, x=CoefShort))			# the basics of the plot
		p <- p + geom_hline(yintercept=0, colour=zeroColor, linetype=zeroType, lwd=zeroLWD)		# the zero line
		p <- p + ribbonGeom[[numeric + 1]]		# the ribbon
		p <- p + geom_point(colour=color)						# the points
		p <- p + geom_line(data=modelCI, aes(y=HighOuter, x=CoefShort, group=Checkers), colour=color) +
			geom_line(data=modelCI, aes(y=LowOuter, x=CoefShort, group=Checkers), colour=color)
	}else if(multi)
    {
        # for a multiplot that plots everything in one panel with dodged geoms
    	p <- ggplot(data=modelCI, aes(y=Coef, x=CoefShort))			# the basics of the plot
    	p <- p + geom_hline(yintercept=0, colour=zeroColor, linetype=zeroType, lwd=zeroLWD)		# the zero line
    	p <- p + outerCIGeom[[(outerCI/outerCI) + multi]] +					# the outer CI bars
    		    innerCIGeom[[innerCI/innerCI + multi]]						# the inner CI bars
       	p <- p + pointGeom[[1 + multi]]						# the points
        p <- p + scale_x_discrete()
     	p <- p + theme(axis.text.y=element_text(angle=textAngle), axis.text.x=element_text(angle=numberAngle)) + labs(title=title, y=xlab, x=ylab)	# labeling and text info
     	p <- p + faceting[[facet + 1]]		# faceting
     	p <- p + if(!horizontal) coord_flip()
    }else
    {
        # for a regular coefplot or a multiplot in seperate facets
        p <- ggplot(data=modelCI, aes(x=Coef, y=CoefShort))    		# the basics of the plot
    	p <- p + geom_vline(xintercept=0, colour=zeroColor, linetype=zeroType, lwd=zeroLWD)		# the zero line
        p <- p + outerCIGeom[[(outerCI/outerCI)]] +    				# the outer CI bars
    		    innerCIGeom[[innerCI/innerCI]]						# the inner CI bars
       	p <- p + pointGeom[[1]]						# the points
        p <- p + theme(axis.text.y=element_text(angle=textAngle), axis.text.x=element_text(angle=numberAngle)) + labs(title=title, x=xlab, y=ylab)    # labeling and text info
        p <- p + faceting[[facet + 1]]    	# faceting
     	p <- p + if(horizontal) coord_flip()
    }
	rm(modelCI);		# housekeeping
    
	return(p)		# return the ggplot object
}
jaredlander/coefplot documentation built on March 26, 2022, 5:08 p.m.