#' Change units of columns in a data frame, e.g. measure relative to one standard deviation
#'
#' useful to make coefficients in regressions better comparable
scale.data.cols = function(dat,unit="sd",cols = setdiff(colnames(dat),exclude), exclude=NULL, dummy01=TRUE) {
units = lapply(colnames(dat), function(col) {
list(unit = "original",size=1, descr="")
})
dummy.unit = list(unit="dummy",size=1,descr="unit: dummy")
unit.val = rep
names(units)=cols
for (col in cols) {
val = dat[[col]]
if (is.dummy.val(val, dummy01)) {
attr(dat[[col]],"unit") = dummy.unit
next
}
nval = scale.values(val, unit=unit, var.name=col)
dat[[col]] = nval
}
attr(dat,"units") = units
dat
}
is.dummy.val = function(val, dummy01=TRUE) {
if (!is.numeric(val)) {
return(TRUE)
}
if (dummy01) {
if (all(unique(val) %in% c(0,1,NA))) {
return(TRUE)
}
}
return(FALSE)
}
make.val.unit = function(val, unit.code=c("sd","1sd","2sd","4sd","6sd","10-90","20-80","25-75","min-max","dummy", "original")) {
#unit = "10-90"; val = dat$account_days
if (unit.code=="original") {
return(list(code = "original",size=1, descr=""))
}
val = na.omit(val)
unit.code = unit.code[1]
if (unit.code=="sd")
unit.code="1sd"
if (substring(unit.code,2)=="sd") {
denom = sd(val) * as.numeric(substring(unit.code,1,1))
} else if (unit.code=="min-max") {
denom = max(val)-min(val)
} else if (unit.code=="dummy") {
denom = 1
} else {
start = as.numeric(substring(unit.code,1,2))
end = as.numeric(substring(unit.code,4,5))
denom = diff(quantile(val, c(start, end)/100))
}
list(code=unit.code, size=denom, descr=paste0("unit ", unit.code, ": ", signif(denom,3)))
}
scale.values = function(val,
unit.code=c("sd","1sd","2sd","4sd","6sd","10-90","20-80","25-75","min-max","nochange"),
var.name=""
) {
restore.point("scale.values")
unit = make.val.unit(val, unit.code)
ret = val / unit$size
attr(ret,"unit") = unit
ret
}
add.values.for.effect = function(val, effect, overwrite=TRUE) {
restore.point("add.values.for.effect")
val = na.omit(val)
if (!overwrite)
old.effect = effect
if (effect$type=="quantile") {
effect$baseline.val = median(val)
effect$low.val = quantile(val,effect$low.percent)
effect$high.val = quantile(val,effect$high.percent)
} else if (effect$type == "sd") {
m = mean(val)
s = sd(val)
effect$baseline.val = m
effect$low.val = m-s*0.5*effect$size
effect$high.val = m+s*0.5*effect$size
} else if (effect$type == "dummy") {
effect$baseline.val = median(val)
effect$low.val = 0
effect$high.val = 1
# original values
} else {
m = mean(val)
effect$baseline.val = m
effect$low.val = m-0.5*effect$size
effect$high.val = m+0.5*effect$size
}
if (!overwrite) {
effect[names(old.effect)] = old.effect
}
effect
}
get.effect.base = function(val=NULL, effect=c("sd","1sd","2sd","4sd","6sd","10-90","20-80","25-75","min-max","dummy", "original"), var=NULL) {
if (is.character(effect)) {
code= effect[1]
if (code=="sd")
code="1sd"
if (code=="original") {
effect=list(code = "original",type="original",var=var, descr="")
} else if (substring(code,2)=="sd") {
effect=list(code = code,type="sd",var=var, size= as.numeric(substring(code,1,1)))
} else if (code=="dummy") {
effect = list(code=code, type=code)
} else {
start = as.numeric(substring(code,1,2))
end = as.numeric(substring(code,4,5))
effect = list(code = code,type="quantile",var=var, low.percent=start/100, high.percent=end/100)
}
}
if (!is.null(val))
effect=add.values.for.effect(val, effect)
effect
}
get.effect.base.df = function(dat, numeric.effect = "10-90", dummy01 = TRUE, effect.bases=NULL) {
restore.point("get.effect.base.df")
dummy.effect = list(type="dummy")
val = dat[[1]]
li = lapply(colnames(dat), function(col) {
val = dat[[col]]
if (is.null(effect.bases[[col]])) {
if (is.dummy.val(val)) {
effect= get.effect.base(val, "dummy", var=col)
} else {
effect = get.effect.base(val, numeric.effect, var=col)
}
} else {
effect = add.values.for.effect(val,effect.bases[[col]], overwrite=FALSE)
}
as.data.frame(c(list(var=col,code=effect$code, type=effect$type),
effect[c("baseline.val","low.val","high.val")]
))
})
df = do.call(rbind,li)
rownames(df)=NULL
#df = rbindlist(li)
df$val.descr = paste0(signif(df$low.val,3)," ",signif(df$baseline.val,3)," ",signif(df$high.val,3))
df
}
#' Simple function to compute effect sizes used by effectplot
#' @export
get.effect.sizes = function(reg, dat,
vars=intersect(colnames(dat), names(coef(reg))),
scale.depvar=NULL, depvar = names(reg$model)[[1]],data.fun = NULL,numeric.effect="10-90", dummy01=TRUE, predict.type="response", effect.bases=NULL, compute.se=FALSE, ci.prob=0.95, repl.for.se=10000) {
restore.point("get.effect.sizes")
library(tidyr)
ebd = get.effect.base.df(dat, numeric.effect=numeric.effect, dummy01=dummy01, effect.bases=effect.bases)
nr = length(vars)*2
base.df = as.data.frame(t(ebd$baseline.val))
colnames(base.df) = colnames(dat)
base.df
key.df = expand.grid(list(level=c("low","high"),var=vars))
df = cbind(key.df,base.df)
var.ebd = match(vars,ebd$var)
names(var.ebd) = vars
row = 0
for (var in vars) {
row = row+1
df[row, var] = ebd$low.val[var.ebd[var]]
row = row+1
df[row, var] = ebd$high.val[var.ebd[var]]
}
df
newdata = df[,-(1:3)]
# compute values of dependent data like
# df$x_sqr = df$x^2
if (!is.null(data.fun)) {
df = data.fun(df)
}
if (is(reg,"felm")) {
pred = predict.felm(reg,newdata=newdata, use.fe = FALSE)
} else {
pred = predict(reg,newdata=newdata, type=predict.type)
}
scale.y =1
if (!is.null(scale.depvar)) {
scale.y = make.val.unit(dat[[depvar]], scale.depvar)$size
pred = pred / scale.y
}
rdf = cbind(key.df, pred)
d = spread(rdf, key = level, value=pred)
d$effect = d$high-d$low
d$abs.effect = abs(d$effect)
d$effect.sign = sign(d$effect)
d$base.descr = ebd$val.descr[var.ebd]
if (compute.se) {
newdf = model.matrix.from.new.data(newdata,reg)
se.df = compute.effect.size.se(reg, repl.for.se,newdata=newdf,scale=scale.y)
d = cbind(d, se.df)
}
d
}
model.matrix.from.new.data = function(newdata, reg, na.action=na.pass,...) {
object=reg
tt <- terms(object)
Terms <- delete.response(tt)
na.action=na.pass
m <- model.frame(Terms, newdata, na.action = na.action, xlev = reg$xlevels)
X <- model.matrix(Terms, m, contrasts.arg = reg$contrasts)
X
}
compute.effect.size.se = function(reg, repl.for.se,newdata,scale=1, add.intercept = FALSE, ci.prob=c(0.05,0.95),...) {
restore.point("compute.effect.size.se")
newmatrix = as.matrix(newdata)
if (add.intercept)
newmatrix=cbind(intercept=1,newmatrix)
pred.fun = get.predict.from.coef.fun(reg)
# check if pred.fun is null
if (is.null(pred.fun)) {
warning("No predict.from.coef function for model of class ", class(reg)[1], " skip computation of se and confidence intervals for effect.")
return(NULL)
}
coef.mat = draw.from.estimator(n=repl.for.se,coef=reg$coef, vcov=vcov(reg))
# P rows of newmatrix (number of predictions)
# R number of draws from estimator
# P x R
mat = apply(coef.mat,1,pred.fun,newmatrix=newmatrix, reg=reg )
mat[,1:2]
pred.vec = as.numeric(apply(coef.mat,1,pred.fun,newmatrix=newmatrix, reg=reg ))
pred.mat = matrix(pred.vec,ncol=2, byrow=TRUE)
pred.effect = (pred.mat[,2]-pred.mat[,1]) / scale
effect.mat = matrix(pred.effect, nrow=repl.for.se, byrow=TRUE)
effect.se = apply(effect.mat,2,sd, na.rm=TRUE)
if (length(ci.prob)==1)
ci.prob = c((1-ci.prob)/2, 1- (1-ci.prob)/2)
ci = apply(effect.mat,2, quantile, probs=ci.prob, na.rm=TRUE)
data.frame(effect.se = effect.se, ci.low=ci[1,], ci.high=ci[2,])
}
examples.effectplot = function() {
# simulate some data
set.seed(12345)
n = 1000
x = rnorm(n)
z = rnorm(n)
q = rnorm(n)
# binary outcome
y = ifelse(pnorm(1 + 0.5*x + 0.25*x^2 - 0.5*z + rnorm(n))>0.5, 1, 0)
data = data.frame(y,x,z,q)
# Logit regression
reg = glm(y~x + x^2 + z +q, data=data, family="binomial")
effectplot(reg,main="Effects", horizontal=TRUE, show.ci=TRUE)
coefplot(reg)
# An example with factors
T = 10
x = sample(1:4, T, replace = TRUE)
y = x^2 + rnorm(T)
xf = as.character(x)
# effectplot can currently not handle factor variables
# in the regression formula
reg = lm(y~xf)
effectplot(reg)
# Workaround: first explicitly generate a
# data.frame with all dummy variables
df = data.frame(y,xf,x)
dat = expanded.regression.data(y~xf,df)
reg = lm(regression.formula(dat), data=dat)
reg
effectplot(reg)
# Short-cut
reg = expanded.regression(lm(y~xf))
effectplot(reg)
# An example for felm
T = 120
x1 <- rnorm(T)
x2 <- rnorm(T)
fe1 = sample(c("yes","no"), T, replace=TRUE)
fe2 = sample(c("a","b","c"), T, replace=TRUE)
y <- -1*x1 + 3*x2 + (fe1=="yes") + 2*(fe2=="a") + (fe2=="b")+ rnorm(T)
dat = data.frame(y,x1,x2,fe1,fe2)
reg = felm(y~x1+x2 | fe1+fe2)
effectplot(reg)
}
name.of.depvar = function(reg) {
if (is(reg,"felm")) return(reg$lhs)
names(reg$model)[[1]]
}
#' Plot for regressions to compare effects sizes of normalized changes in the explanatory variables
#'
#' The plot shall help to compare magnitudes of the influence of different explanatory variables. The default effect is "10-90", i.e. the effect of when -ceteris paribus- changing an (numeric) explanatory variable from its 10% quantile value to its 90% quantile. For dummy variables, we just consider the effect from changing it from 0 to 1.
#'
#' @param reg the results from a regression, e.g. from a call to lm or glm
#' @param dat default = the data frame the regression was estimated from
#' @param vars the explanatory variables that shall be shown in the plot
#' @param numeric.effect a code describing the lowest and highest values of numeric explanatory variables used to calculate the effect, e.g. "05-95" means taking the effect of moving from the 5 percent to the 95 percent quantile.
#' @param dummy01 shall numeric varibles that have only 0 and 1 as values be treated as a dummy variables?
#' @param sort if TRUE (default) sort the effects by size
#' @param scale.depvar a scaling for the dependent variable
#' @param depvar name of the dependent variable
#' @param xlab, ylab labels
#' @param colors colors for positive values (pos) and negative values (neg)
#' @param horizontal shall bars be shown horizontally?
#' @param show.ci shall confidence intervals be shown?
#' @param ci.prob left and right probability level for confidence intervals
#' @param num.ticks the number of tick marks on the effect size axis
#' @param add.numbers shall the effect sizes be added as numbers in the plot?
#' @param numbers.align How shall the effect size numbers be aligned: "left","center", "right" align all numbers at the same horizontal posistion for all variables. "left_of_bar_end" and "right_of_bar_end" align them at the end of each bar.
#' @param numbers.vjust Vertical adjustment of the numbers
#' @param left.margin extra margin left of bars as fraction of maximum bar width
#' @param right.margin extra margin right of bars as fraction of maximum bar width
#' @param signif.digits number of significant digits for effect sizes
#' @param round.digits number of digits effect sizes shall be rounded to
#' @param ... further arguments passed to qplot. E.g. you can set "main" to specify a title of the plot.
#' @export
effectplot = function(reg, dat=get.regression.data(reg,source.data=source.data),source.data = NULL, main = NULL,
vars=intersect(colnames(dat), names(coef(reg))),
ignore.vars = NULL,
numeric.effect="10-90", dummy01=TRUE,
sort = TRUE,
scale.depvar=NULL, depvar = name.of.depvar(reg),
xlab="Explanatory variables\n(low baseline high)",
ylab=paste0("Effect on ", depvar,""),
colors = c("pos" = "#11AAAA", "neg" = "#EE3355"),
effect.sizes=NULL, effect.bases = NULL, horizontal=TRUE,
show.ci = FALSE, ci.prob =c(0.05,0.95), num.ticks=NULL,
add.numbers=TRUE,
numbers.align = c("center","left","right","left_of_bar_end","right_of_bar_end")[1],
numbers.vjust = ifelse(show.ci,0,0.5),
left.margin = 0, right.margin=0,
signif.digits = 2, round.digits=8,
alpha = 0.8,...
) {
library(ggplot2)
#
# org.dat = dat
#
# if (missing(model.matrix)) {
# model.matrix = dat
# try({
# model.matrix <- model.matrix(reg)
# if (all(model.matrix[,1]==1))
# model.matrix <- model.matrix[,-1, drop=FALSE]
# },silent=TRUE)
# }
# if (!is.null(model.matrix)) {
# dat = cbind(dat[[depvar]],as.data.frame(model.matrix))
# colnames(dat)[1] = depvar
# }
# rownames(dat) = NULL
# if (missing(vars))
# vars = colnames(dat)
restore.point("effectplot")
vars = setdiff(vars, ignore.vars)
if (is.null(effect.sizes)) {
es = get.effect.sizes(reg,dat, vars,depvar=depvar, scale.depvar=scale.depvar,numeric.effect=numeric.effect, dummy01=dummy01, effect.bases=effect.bases, compute.se=show.ci, ci.prob=ci.prob)
} else {
es = effect.sizes
}
es$coef.name = paste0(es$var,"\n",es$base.descr)
if (sort)
es = es[order(es$abs.effect), ]
es$sign = ifelse(sign(es$effect)>=0,"pos","neg")
# Set factor name in order to show sorted plot
es$name = factor(es$coef.name, es$coef.name)
es$round.effect = round(es$effect,round.digits)
es$round.effect = sapply(es$round.effect, signif, digits=signif.digits)
add.str = ifelse(es$round.effect>=0, " ","")
es$round.effect = paste0(add.str,es$round.effect)
#qplot(data=es, y=abs.effect, x=name, fill=sign, geom="bar", stat="identity",xlab=xlab,ylab=ylab) + coord_flip() +
if (show.ci) {
es$abs.ci.low = es$ci.low * es$effect.sign
es$abs.ci.high = es$ci.high * es$effect.sign
}
#p = qplot(data=es, y=abs.effect, x=name, fill=sign, geom="bar", stat="identity",xlab=xlab,ylab=ylab,...) + scale_fill_manual(values=colors)
p = ggplot(data=es, aes(y=abs.effect, x=name, fill=sign)) +
geom_bar(stat = "identity", alpha=alpha) +
scale_fill_manual(values=colors) +
xlab(xlab) + ylab(ylab)
if (!is.null(main))
p = p + ggtitle(main)
if (horizontal)
p = p+coord_flip() #+ theme_wsj()
if (show.ci) {
p = p +geom_errorbar(aes(ymin=abs.ci.low, ymax=abs.ci.high), position="dodge", width=0.25, colour=gray(0.3, alpha=0.6))
}
if (!is.null(num.ticks)) {
number_ticks <- function(n) {function(limits) pretty(limits, n)}
p = p + scale_y_continuous(breaks=number_ticks(num.ticks))
}
if (add.numbers) {
if (numbers.align=="left_of_bar_end") {
p = p + geom_text(aes(x=name, y=abs.effect,
ymax=max(abs.effect)*(1+right.margin), ymin=-(left.margin*max(abs.effect)),
label=round.effect, hjust=1, vjust=numbers.vjust),
position = position_dodge(width=1))
} else if (numbers.align=="right_of_bar_end") {
p = p + geom_text(aes(x=name, y=abs.effect,
ymax=max(abs.effect)*(1+right.margin), ymin=-(left.margin*max(abs.effect)),
label=round.effect, hjust=0,vjust=numbers.vjust))
} else {
.POS = 0.5
if (numbers.align == "left") {
.POS = 0
} else if (numbers.align=="center") {
.POS = 0.5
} else if (numbers.align=="right") {
.POS = 1
} else if (is.numeric(numbers.align)) {
.POS = numbers.align
}
p = p + geom_text(aes(x = name, y = .POS*max(abs.effect),
ymax=max(abs.effect)*(1+right.margin),
ymin=-(left.margin*max(abs.effect)),
label=round.effect, hjust=0,vjust=numbers.vjust))
}
}
#numbers.align = "left_of_bar_end","right_of_bar_end","center","left","right"
p
}
#' Graphically compare sizes of regression coefficient
#'
#' mainly useful if regression is run on data that has been normalized by set.data.units
#' @export
coefplot = function(reg,data=NULL, sort=TRUE, remove.intercept=TRUE, show.units=!is.null(data), xlab="Explanatory variables", ylab=paste0("Coefficient"),
colors = c("pos" = "#11AAAA", "neg" = "#EE3355"), horizontal=TRUE, vars=names(coef(reg)),ignore.vars = NULL) {
restore.point("coefplot")
library(ggplot2)
restore.point("coef.plot")
coef = coef(reg)
if (remove.intercept)
coef = coef[-1]
vars = setdiff(vars, ignore.vars)
coef = coef[names(coef) %in% vars]
df = data.frame(coef.name=names(coef),coef=coef, abs.coef = abs(coef), sign=ifelse(sign(coef)>=0,"pos","neg"), stringsAsFactors=FALSE)
if (show.units) {
cols = intersect(names(coef),colnames(data))
units = lapply(df$coef.name, function(col) {
if (col %in% cols)
return(attr(data[[col]],"unit"))
return(list(unit = "original",size=1, descr=""))
})
units.descr = lapply(units, function(unit) unit$descr)
df$coef.name = paste0(df$coef.name,"\n",units.descr)
}
if (sort)
df = df[order(abs(coef)), ]
# Set factor name in order to show sorted plot
df$name = factor(df$coef.name, df$coef.name)
alpha = 1
p = ggplot(data=df, aes(y=abs.coef, x=name, fill=sign)) +
geom_bar(stat = "identity", alpha=alpha) +
scale_fill_manual(values=colors) +
xlab(xlab) + ylab(ylab)
if (horizontal)
p = p+coord_flip() #+ theme_wsj()
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.