# Copyright (c) 2010-2014 Nokome Bentley, Trophia Ltd
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' A package for generating step plots, influence plots, CDI plots, and influence metrics for a linear model
#'
#' The concept of influence in generalised linear models is described in
#' Bentley, N., Kendrick, T. H., Starr, P. J., & Breen, P. A. (2011). Influence plots and metrics: tools for better understanding fisheries catch-per-unit-effort standardisations.
#' This package provides an implementation of the plots and metrics described in that paper. ICES Journal of Marine Science, doi:10.1093/icesjms/fsr174.
#'
#' Currently, this package works for \code{glm} models with log transformed dependent variables. These
#' are the type of models commonly used for one part of the delta-lognormal approach to catch-per-unit-effort (CPUE)
#' standardisation. e.g.\code{ model = glm(log(catch)~year+month+vessel+effort)}
#' In time, the package may be generalised to other types of models (e.g. \code{gam}s) and other types of dependent variables.
#'
#' This document provides minimal documentation and the best way to learn how to use this package is through the vignette:
#' "An example of how to use the 'influ' package"
#'
#' @name influ-package
#' @aliases influ
#' @title Influence in linear models
#' @author Nokome Bentley
#' @import proto
#' @export Influence
NULL
#' The influence prototype object.
#'
#' @seealso Influence$new
#' @import proto
#' @export
#'
Influence = proto()
#' Create a new Influence object.
#'
#' A new Influence object needs to be created for each combination of a model and focus term.
#' For example, you might compare the inflence of the same term in two separate models:\code{
#' influ1 = Influence$new(model1,'year')
#' influ2 = Influence$new(model2,'year')
#' }
#'
#' @name Influence$new
#' @param . Not sure how this works
#' @param model The model for which the influence of terms will be determined
#' @param data The data which was used to fit the model (required for some types of models that do not store the data)
#' @param response The response term in the model
#' @param focus The focus term for which influence is to be calculated.
#' @return A new Influence object
#'
Influence$new <- function(., model, data = NULL, response = NULL, focus = NULL) {
instance = .$proto(
model = model,
data = data,
response = response,
focus = focus
)
instance$init()
instance
}
#' Initialise the Influence object
#'
#' This method should not need to be called directly. It is called from the \code{$new} constructor method.
#'
#' @name Influence$init
#' @return None
#'
Influence$init <- function(.) {
# If no data was supplied....
if (is.null(.$data)) {
# See if the model has a data variable
.$data = .$model$data
if (is.null(.$data)) {
# See if it is available as a model attribute
.$data = attr(.$model, "data")
}
if(is.null(.$data)){
stop("You need to provide the data that was fitted to for this type of model e. Influence$new(model,data,...)")
}
}
.$terms = attr(.$model$terms,"term.labels")
if(is.null(.$response)) .$response = names(.$model$model)[1]
if(is.null(.$focus)) .$focus = .$terms[1]
.$labels = list()
for(term in .$terms) .$labels[term] = term
.$orders = list()
for(term in .$terms) .$orders[term] = 'asis'
#An alternative, but potentially buggy, way of getting response term
#Obtain the response name. This seems the safest way, albeit long winded.
#variablesList = attr(model$terms,"variables") #List of variables including the response
#responseIndex = attr(model$terms,"response") #Index of the response variable in variablesList
#responseName = as.character(variablesList)[responseIndex+1]
}
#' Obtain the coefficients associated with a particular term in the model
#'
#' Extracts the coefficients
#'
#' @name coeffs
#' @param . The model to extract coefficients from
#' @param model The model to extract coefficients from
#' @param term The term in the model for which coefficients are extracted
#' @return A vector of coefficients
#' @importFrom stats coefficients
#' @export
#'
coeffs <- function(., model = .$model, term = .$focus) {
coeffs <- coefficients(model)
rows <- substr(names(coeffs), 1, nchar(term)) == term
c(0, coeffs[rows])
}
#' Obtain standard errors for the coefficients associated with a particular term in the model
#'
#' Extract standard errors using the method of Francis
#'
#' @name ses
#' @param . The model to extract standard errors for a coefficient from
#' @param model The model to extract standard errors for a coefficient from
#' @param term The term in the model for which coefficients SEs are extracted
#' @export
#'
ses <- function(., model = .$model, term = .$focus) {
type = class(model)[1]
if (type == 'glm' | type == 'negbin') {
#The "cov.scaled" member of a glm object is the estimated covariance matrix of
#the estimated coefficients scaled by dispersion"
V = summary(model)$cov.scaled
}
else if (type == 'survreg') {
#The member "var" for a survreg object is the "the variance-covariance matrix for the parameters,
#including the log(scale) parameter(s)"
V = model$var
}
rows = substr(row.names(V), 1, nchar(term)) == term
V = V[rows,rows]
n = sum(rows) + 1
Q = matrix(-1/n, nrow = n, ncol = n - 1)
Q[-1,] = Q[-1,] + diag(rep(1,n-1))
V0 = (Q%*%V) %*% t(Q)
se = sqrt(diag(V0))
se
}
#' Obtain the effects associated with a particular term in the model
#'
#' @name Influence$effects
#' @param model The model to extract effects from
#' @param term The term in the model for which effects are extracted
#' @return A numeric vector of effects for the model term
#'
Influence$effects = function(., model = .$model, term = .$focus){
coeffs = .$coeffs(model,term)
exp(coeffs - mean(coeffs))
}
#' Perform necessary calculations
#'
#' This method must be called for the \code{$summary} data frame to be available or before any plots (\code{$stanPlot(),$stepPlot(),$influPlot(),$cdiPlot()}) can be generated from the Influence object
#'
#' @name Influence$calc
#' @return None
#'
Influence$calc <- function(.){
#Get observed values
observed = .$model$model[,.$response]
if (class(observed)[1] == 'Surv') observed = as.numeric(observed[,1])
logged = substr(.$response,1,4) == 'log('
#Create a data frame that is used to store various values for each level of focal term
.$indices = data.frame(level=levels(.$model$model[,.$focus]))
#Add an unstandardised index
if (logged | sum(observed<=0)==0){
if (logged) log_observed = observed else log_observed = log(observed)
# Calculate geometic mean
.$indices = merge(.$indices,aggregate(list(unstan=log_observed),list(level=.$model$model[,.$focus]),mean))
# Turn into a relative index
.$indices$unstan = with(.$indices,exp(unstan-mean(unstan)))
} else {
# There are zeros (or even negative numbers) in the data so we cant calculate a geometric mean.
# Use arithmetic mean instead
.$indices = merge(.$indices,aggregate(list(unstan=observed),list(level=.$model$model[,.$focus]),mean))
# Turn into a relative index
.$indices$unstan = with(.$indices,unstan/mean(unstan))
}
#Add standardised index.
coeffs = .$coeffs()
ses = .$ses()
base = mean(coeffs)
.$indices$stan = exp(coeffs-base)
.$indices$stanLower = exp(coeffs-base-2*ses)
.$indices$stanUpper = exp(coeffs-base+2*ses)
#Create models with terms succesively added
.$summary = NULL
#TODO calculate influence statistics in this loop too
for (termCount in 0:length(.$terms)) {
if (termCount > 0) {
term = .$terms[termCount]
#Update both formula and model
newFormula = formula(.$model)
newFormula = update.formula(newFormula,formula(paste("~",paste(paste(.$terms[1:termCount],collapse='+')))))
model = update(.$model,newFormula)
#Get index for this model
index = .$effects(model)
#Add column to .$indices
.$indices = cbind(.$indices,index)
#Give index column the right hand side of formula as name
names(.$indices)[ncol(.$indices)] = if(termCount==1) term else paste('+',term)
} else {
term = 'intercept'
model = update(.$model,.~1)
}
type = class(model)[1]
logLike = switch(type,
survreg = model$loglik[2],
logLik(model)
)
fitted = switch(type,
survreg = predict(model,type='response'),
fitted(model)
)
#Sums-of-squared based R2 based on log(observed) and log(fitted)
if(termCount==0) r2 = 0 else r2 = cor(log(observed),log(fitted))^2
#Deviance pseudo-R2
r2Dev = (model$null.deviance-model$deviance)/model$null.deviance
if(length(r2Dev)==0) r2Dev = NA
#Negelkerke pseudo-R2
if(termCount==0) logLikeInterceptOnly = logLike
n = length(observed)
r2Negel = (1-exp((logLikeInterceptOnly-logLike)*(2/n)))/(1-exp(logLikeInterceptOnly*(2/n)))
.$summary = rbind(.$summary,data.frame(
term = term,
k = length(coef(model)),
logLike = logLike,
aic = extractAIC(model)[2],
r2 = r2,
r2Dev = r2Dev,
r2Negel = r2Negel
))
}
#R2 values presented as differences
.$summary = within(.$summary,{
k = c(1,diff(k))
r2 = c(NA,diff(r2))
r2Dev = c(NA,diff(r2Dev))
r2Negel = c(NA,diff(r2Negel))
})
#Calculate predicted values
preds = predict(.$model,type='terms',se.fit=T)
fit = as.data.frame(preds$fit)
se.fit = as.data.frame(preds$se.fit)
preds = cbind(fit,se.fit)
names(preds) = c(paste('fit',names(fit),sep='.'),paste('se.fit',names(fit),sep='.'))
.$preds = cbind(.$data,preds)
#Calculate influences and statisitcs
.$influences = data.frame(level=levels(.$model$model[,.$focus]))
overall = c(NA,NA) # NAs for null model and for focus term
trend = c(NA,NA)
for(term in .$terms){
if(term!=.$focus){
infl = aggregate(
list(value = .$preds[,paste('fit',term,sep='.')]),
list(level = .$preds[,.$focus]),
mean
)
overall = c(overall,with(infl,exp(mean(abs(value)))-1))
trend = c(trend,with(infl,exp(cov(1:length(value),value)/var(1:length(value)))-1))
names(infl) = c('level',term)
.$influences = merge(.$influences,infl,all.x=T,by='level')
}
}
#Insert statistics into summary table with NAs for NULL and focus terms
.$summary$overall = overall
.$summary$trend = trend
}
#' Standardization plot
#'
#' This plot simply compares the standardised and unstandardised indices from a model so that the overall standardisation effect can be ascertained.
#'
#' @name Influence$stanPlot
#' @return None
#'
Influence$stanPlot <- function(.){
with(.$indices,{
plot(NA,ylim=c(0,max(unstan,stanUpper)),xlim=c(1,length(level)),las=1,ylab='Index',xlab=.$labels[[.$focus]],xaxt='n')
lines(as.integer(level),unstan,type='o',pch=1,cex=1.25)
lines(as.integer(level),stan,type='o',pch=16,cex=1.25)
segments(as.integer(level),stanLower,as.integer(level),stanUpper)
axis(side=1,at=1:length(levels(level)),labels=levels(level))
legend('top',legend=c('Unstandardised','Standardised'),pch=c(1,16),pt.cex=1.25,bty='n')
})
}
#' Step plot
#'
#' A plot of the standardised indices as each explanatory variable is added to the model (in the order that they are specified in the model equation)
#'
#' @name Influence$stepPlot
#' @param panels Whether or not to produce a "progressive" step plot (default) in which each step is represented by a separate
#' panel with the previous steps shown as a dashed line and other steps shown as dashed grey lines. If FALSE then a single panel
#' plot is produced with symbols for each step
#' @return None
#'
Influence$stepPlot <- function(.,panels=T,setpar=T){
startCol = 6
cols = startCol:ncol(.$indices)
if(panels){
if(setpar)par(mfrow=c(length(cols),1),mar=c(0,4,0,0),oma=c(4,1,1,1))
levels = as.integer(.$indices$level)
xlim=c(1,length(levels))
ylim=c(0,max(.$indices[,cols]))
for(col in cols){
plot(NA,xaxt='n',las=1,ylab='Index',ylim=ylim,xlim=xlim)
#abline(h=1,lty=2)
for(prev in cols[1]:col){
if(prev<col-1) lines(levels,.$indices[,prev],col='grey',lwd=1.5)
if(prev==col-1) lines(levels,.$indices[,prev],lty=3,col='black',lwd=1.5)
if(prev==col) points(levels,.$indices[,prev],pch=16,cex=1.25,col='black',type='o')
}
legend('topleft',legend=names(.$indices)[col],bty='n')
}
axis(side=1,at=1:length(.$indices$level),labels=.$indices$level)
}
else {
with(.$indices,{
plot(NA,ylim=c(0,max(.$indices[,cols])),xlim=c(1,nrow(.$indices)),las=1,ylab='Index',xlab=.$labels[[.$focus]],xaxt='n')
for(col in cols) lines(as.integer(level),.$indices[,col],type='o',pch=col-startCol+1,cex=1.25)
axis(side=1,at=1:length(level),labels=level)
legend('top',legend=names(.$indices)[cols],pch=cols-startCol+1,pt.cex=1.25,bty='n')
})
}
}
#' Influence plot
#'
#' A plot of the influence of each explanatory variable in the model
#'
#' @name Influence$influPlot
#' @param panels Whether or not to produce a plot with separate panels for each variable
#' @return None
#'
Influence$influPlot <- function(.,panels=T,setpar=T){
cols = 2:ncol(.$influences)
ylim=exp(range(.$influences[,cols]))
xlim=c(1,nrow(.$influences))
if(panels){
if(setpar) par(mfrow=c(length(cols),1),mar=c(0,4,0,0),oma=c(4,1,1,1))
levels = as.integer(.$influences$level)
for(col in cols){
plot(levels,exp(.$influences[,col]),pch=16,cex=1.25,col='black',type='o',xaxt='n',las=1,ylab='Influence',ylim=ylim,xlim=xlim)
abline(h=1,lty=2)
legend('topleft',legend=names(.$influences)[col],bty='n')
}
axis(side=1,at=1:length(.$influences$level),labels=.$influences$level)
}
else{
with(.$influences,{
plot(NA,ylim=ylim,xlim=xlim,las=1,ylab='Influence',xlab=.$labels[[.$focus]],xaxt='n')
for(col in cols) lines(as.integer(level),exp(.$influences[,col]),type='o',pch=col,cex=1.25)
axis(side=1,at=1:length(level),labels=level)
legend('top',legend=names(.$influences)[cols],pch=cols,pt.cex=1.25,bty='n')
abline(h=1,lty=2)
})
}
}
#' Step and influence plots
#'
#' Side by side, panellised step and influence plots
#'
#' @name Influence$stepAndInfluPlot
#' @return None
#'
Influence$stepAndInfluPlot <- function(.){
par(mfcol=c(ncol(.$indices)-5,2),mar=c(0,5,0,0),oma=c(4,1,1,1))
.$stepPlot(setpar=F)
#Create a blank plot in the top of the influences column for the row for the focus term.
plot.new()
.$influPlot(setpar=F)
}
#' A coefficient-distribution-influence (CDI) plot for a model term
#'
#' @name Influence$cdiPlot
#' @param term The model term for which the CDI plot should be generated
#' @param variable Optional. In some cases it may be necessary to specifiy the variable that is related to the term. For example, for the term \code{"log(effort)"} the variable is \code{"effort"}.
#' In most cases this can be deduced from the term, but for complexes terms this argumey may also need to be supplied (an error message will tell you if it does).
#' @return None
#' @seealso cdiPlotAll
#'
Influence$cdiPlot <- function(.,term,variable=NULL){
par(oma=c(1,1,1,1),cex.lab=1.25,cex.axis=1.25)
layout(matrix(c(1,1,0,2,2,3,2,2,3),3,3,byrow=TRUE))
#Define levels of term on which coefficient and distribution plots will be based
#This is done by search for each column name in the data as a whole word in the
#each term. This allows for matching of terms like 'poly(log(var),3)' with 'var'
if(is.null(variable)){
for(name in names(.$preds)){
match = grep(paste('([^[:alnum:]_])+',name,'([^[:alnum:]_])+',sep=''),paste('(',term,')')) # ([^\\w])+ Means ignore any 'word' character (alphanumerics plus underscore)
if(length(match)>0){
variable = name
break
}
}
}
if(is.null(variable)) stop('Unable to find a matching variable for term "',term,'". Please specify in the argument "variable".')
levels = .$preds[,variable]
#Numeric terms are cut into factors
if(is.numeric(levels)){
breaks = pretty(levels,30)
step = breaks[2]-breaks[1]
labels = breaks+step/2
breaks = c(breaks,breaks[length(breaks)]+step)
levels = cut(levels,breaks,labels=labels,include.lowest=T)
}
#Reorder levels according to coefficients if necessary
if(.$orders[[term]]=='coef'){
coeffs = aggregate(.$preds[,paste('fit',term,sep='.')],list(levels),mean)
names(coeffs) = c('term','coeff')
coeffs = coeffs[order(coeffs$coeff),]
levels = factor(levels,levels=coeffs$term,ordered=T)
}
#Coefficients
coeffs = aggregate(.$preds[,paste(c('fit','se.fit'),term,sep='.')],list(levels),mean)
names(coeffs) = c('term','coeff','se')
coeffs = within(coeffs,{
lower = coeff-se
upper = coeff+se
})
par(mar=c(0,5,3,0),las=1)
with(coeffs,{
xs = 1:max(as.integer(term))
ylim = c(min(exp(lower)),max(exp(upper)))
if(ylim[1]<0.5*min(exp(coeff))) ylim[1] = 0.5*min(exp(coeff))
if(ylim[2]>2*max(exp(coeff))) ylim[2] = 2*max(exp(coeff))
plot(as.integer(term),exp(coeff),xlim=range(xs),ylim=ylim,pch=2,cex=1.5,xaxt='n',ylab='',log='y')
mtext('Coefficient',side=2,line=4,las=0,cex=0.8)
abline(h=1,lty=2)
abline(v=xs,lty=1,col='grey')
segments(as.integer(term),exp(lower),as.integer(term),exp(upper))
arrows(as.integer(term),exp(lower),as.integer(term),exp(upper),angle=90,code=3,length=0.05)
axis(3,at=xs,labels=levels(term)[xs])
})
#Distribution
distrs = aggregate(.$preds[,1],list(levels,.$preds[,.$focus]),length)
names(distrs) = c('term','focus','count')
distrs = merge(distrs,aggregate(list(total=distrs$count),list(focus=distrs$focus),sum))
distrs$prop = with(distrs,count/total)
par(mar=c(5,5,0,0),las=1)
xlab = .$labels[[variable]]
if(is.null(xlab)) xlab = variable
ylab= .$labels[[.$focus]]
if(is.null(ylab)) ylab = .$focus
with(distrs,{
xs = 1:max(as.integer(term))
ys = 1:max(as.integer(focus))
plot(NA,xlim=range(xs),ylim=range(ys),xaxt='n',yaxt='n',xlab=xlab,ylab='')
abline(v=xs,lty=1,col='grey')
axis(1,at=xs,labels=levels(term)[xs])
abline(h=ys,lty=1,col='grey')
axis(2,at=ys,labels=levels(focus)[ys])
mtext(ylab,side=2,line=4,las=0,cex=0.8)
points(as.integer(term),as.integer(focus),cex=sqrt(prop)*12)
})
#Influence
par(mar=c(5,0,0,3),las=1)
ys = 1:nrow(.$influences)
with(.$influences,{
plot(NA,xlim=range(exp(get(term))),ylim=range(ys),yaxt='n',xlab='Influence')
abline(h=ys,lty=1,col='grey')
abline(v=1,lty=2)
points(exp(get(term)),ys,type='o',pch=16,cex=1.8)
axis(4,at=ys,labels=level)
})
}
#' Plot a CDI plot for each of the terms in the model
#'
#' @name Influence$cdiPlotAll
#' @param done A function that is called (with the term string as an argument) after each CDI plot is generated. Optional.
#' @return None
#'
Influence$cdiPlotAll <- function(.,done=function(term){
cat("cdiPlot for",term,". Press enter for next")
scan()
}){
for(term in .$terms) {
if(term!=.$focus){
.$cdiPlot(term)
done(term)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.