#
# Rstatistic.R
#Fri 19 Jan 2007 11:06:44 PM CET
# contains simple statistics to evaluate consulting questions
sizesDesc = function(s) {
col.frame(list(
mean = mean(s),
median = median(s),
stddev = sqrt(var(s)),
quantiles = quantile(s)
), do.paste = " ", digits = 1)
}
compareSamples = function(l) {
desc = data.frame(lapply(l, function(e)sizesDesc(e)));
print(desc);
tests = col.frame(list(
test.t = t.test(l[[1]], l[[2]])$p.value,
test.wilcoxon = wilcox.test(l[[1]], l[[2]])$p.value
));
print(tests);
}
df2numeric = function(df) apply(df, 2, function(col)as.numeric(as.vector(col)));
expandCounts = function(tab) unlist(apply(tab, 1, function(r){rep(r[1], r[2])}));
chisq.test.robust = function(tab, bootstrapCellCount = 5, B = 5e3) {
# reduce table by removing 0-marginals and check for degeneration
tab = tab[, !apply(tab, 2, function(c)all(c == 0))];
if (is.vector(tab)) return(list(p.value = NA));
tab = tab[!apply(tab, 1, function(r)all(r == 0)), ];
if (is.vector(tab)) return(list(p.value = NA));
# determine whether to bootstrap
r = if (any(tab < bootstrapCellCount))
chisq.test(tab, simulate.p.value = T, B = B) else
chisq.test(tab);
r
}
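# Example (sketch): a zero column is dropped first; the remaining sparse cells trigger simulation
# tab = matrix(c(12, 3, 0, 0, 1, 9), ncol = 3); # column 2 is all zero and gets removed
# chisq.test.robust(tab)$p.value                # min cell < 5, so a simulated p-value is used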
# depends on coin package <!>, unfinished
armitage.test.robust = function(formula, df, scores) {
tab = table(df);
# only eliminate 0-rows of table from score vector
zRows = sapply(1:dim(tab)[1], function(i){ all(tab[i,] == 0) });
scores[[1]] = scores[[1]][!zRows];
r = independence_test(formula, df, teststat = "quad", scores = scores);
r
}
# simulations in project 2014-02-Microsatellites
logSumExpRaw = function(v, pivot = median(v))(log(sum(exp(v - pivot))) + pivot)
logSumExpPivot = logSumExpMax = function(v)logSumExpRaw(v, pivot = max(v))
logSumExp = function(x) {
Imx = which.max(x);
log1p(sum(exp(x[-Imx] - x[Imx]))) + x[Imx]
}
logSumExp_add = function(v, bins) {
diff = abs(bins - v);
diffB = pop(bins) - shift(bins);
if (max(diff) > max(diffB)) { # bin reshuffling
Ibin = which.min(diffB);
bins[Ibin] = logSumExp(bins[c(Ibin, Ibin + 1)]);
bins[Ibin + 1] = v;
} else {
Ibin = which.min(abs(diff));
bins[Ibin] = logSumExp(c(v, bins[Ibin]));
}
return(sort(bins, na.last = T, decreasing = T));
}
logSumExpIter_raw = function(x, Nbins = 30) {
if (length(x) < Nbins) return(x);
lse = x[1:Nbins];
for (e in x[-(1:Nbins)]) lse = logSumExp_add(e, lse);
return(logSumExp(lse));
}
logSumExpIter = function(x, Nbins = 30) {
return(logSumExp(logSumExpIter_raw(x, Nbins)));
}
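# Example (sketch, relies on the pop/shift helpers from the accompanying utility library):
# binned streaming accumulation closely approximates the exact result
# x = rnorm(1e4);
# c(logSumExp(x), logSumExpIter(x))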
# sumExp = function(x) {
# Imx = which.max(x);
# sum(exp(x - x[Imx])) * exp(x[Imx])
# }
sumExp = function(x)exp(logSumExp(x))
meanExp = function(x)(sumExp(x)/length(x))
# rejFrac = function(x, alpha = 0.05) {
# x = na.omit(x);
# f = count(x <= alpha) / length(x);
# f
# }
rejFrac = function(x, alpha = 0.05)mean(x <= alpha, na.rm = T);
vector.std = function(v, C = 1)(C * v / sum(v));
vector.std.log = function(v, C = 0)(v - (logSumExp(v) - C));
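# Example: vector.std.log normalizes log-weights so they sum to exp(C) on the natural scale
# lw = log(c(1, 2, 7));
# sumExp(vector.std.log(lw))  # == 1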
#
# <p> ml methods
#
lhWrapperFunctions = c("initialize",
"parsScale", "parsMap", "parsMapInv", "parsStart", "parsNames", "lh", "null2alt", "alt2null"
);
# <!> transition to S4-objects
lhGetWrapper = function(prefix, self, ...) {
createNullWrapper = F;
f = list();
if (substr(prefix, nchar(prefix) - 3, nchar(prefix)) == "null") {
createNullWrapper = T;
prefix = substr(prefix, 1, nchar(prefix) - 5);
}
for (n in lhWrapperFunctions) {
f[[n]] = mget(sprintf("%s.%s", prefix, n), envir = globalenv(), ifnotfound=list(NULL))[[1]];
}
f$self = if (is.null(self)) { if (is.null(f$initialize)) list(...) else f$initialize(...) } else self;
if (createNullWrapper) {
f1 = f;
self = f1$self = f$self;
f1$parsStart = function(self){ f$alt2null(self, f$parsStart(self)) };
f1$parsScale = function(self){ f$alt2null(self, f$parsScale(self)) };
f1$parsMap = function(self, p){ f$alt2null(self, f$parsMap(self, f$null2alt(self, p))) };
f1$parsMapInv = function(self, p){ f$alt2null(self, f$parsMapInv(self, f$null2alt(self, p))) };
f1$lh = function(self){ lhRaw = f$lh(self); function(p)lhRaw(f$null2alt(self, p)) };
return(f1);
}
f
}
lhCopyWrapper = function(name, template) {
for (f in lhWrapperFunctions) {
g = mget(sprintf("%s.%s", template, f), envir = globalenv(), ifnotfound=list(NULL))[[1]];
if (!is.null(g)) eval.parent(parse(text = sprintf("%s.%s = %s.%s;", name, f, template, f)));
}
}
lhInit = function(lhWrapper) {
}
mapU = function(y){ -log(1/y - 1) }
map2U = function(z){ 1/(1 + exp(-z)) }
# one-dimensional estimation
lhMlEstimatorOD = function(lhWrapper = NULL, start = NULL, c = NULL, ...) {
if (is.null(c)) c = list(tol = .Machine$double.eps^0.25);
f = lhGetWrapper(lhWrapper, c$self, ...);
lhRaw = f$lh(f$self);
lh = function(p) { lhRaw(mapU(f$parsMap(f$self, p))) }
o = try(optimize(lh, lower = 0, upper = 1, tol = c$tol, maximum = T));
r = list(par = mapU(f$parsMap(f$self, o$maximum)), par.os = o$maximum, value = o$objective);
r
}
# multi-dimensional estimation
lhMlEstimatorMD = function(lhWrapper = NULL, start = NULL, c = NULL, ...) {
if (is.null(c)) c = list(do.sann = F, sann.cycles = 1000);
f = lhGetWrapper(lhWrapper, c$self, ...);
eps = 1e-5;
#if (!is.null(start)) { starts = matrix(start, nrow = 1); }
if (is.null(start)) start = f$parsStart(f$self);
starts = if (!is.matrix(start)) matrix(as.numeric(unlist(start)), nrow = 1) else start;
parscale = f$parsScale(f$self);
lhRaw = f$lh(f$self);
lh = function(p) { lhRaw(f$parsMap(f$self, p)) }
os = apply(starts, 1, function(s) {
s = f$parsMapInv(f$self, s);
o = try(optim(s, lh, method = "Nelder-Mead",
control = list(fnscale = -1, parscale = parscale, maxit = 1000),
));
if (class(o) == "try-error") return(NA);
if (0) { # if (o$convergence > 0 || c$do.sann) { # Nelder-Mead failed to converged
o1 = try(optim(s, lh, method = "SANN",
control = list(fnscale = -1, parscale = parscale, maxit = c$sann.cycles),
));
#if (class(o1) == "try-error") return(NA);
if (o$convergence > 0 || o1$value > o$value) o = o1;
}
o$par.os = o$par; # parameter values in optimization space
o$par = f$parsMap(f$self, o$par);
o
});
if (all(is.na(os))) return(NA);
vs = sapply(os, function(o){o$value});
arg.max = which.max(vs);
estimate = os[[arg.max[1]]];
fisher = list();
#if (!is.null(c$computeFisher) & c$computeFisher)
if (!is.null(c$computeFisher)) fisher = estimate.fisher(d, estimate, fisher.eps = 1e-1); # <!> 'd' undefined in this scope
r = c(estimate, fisher);
r
}
lhMlEstimator = function(lhWrapper = NULL, start = NULL, c = NULL, ...) {
f = lhGetWrapper(lhWrapper, c$self, ...);
r = if (length(f$parsStart(f$self)) > 1) {
lhMlEstimatorMD(lhWrapper, start, c, ...);
} else if (length(f$parsStart(f$self)) == 1) {
lhMlEstimatorOD(lhWrapper, start, c, ...);
} else { # null hypothesis w/o nuisance parameters
r = f$lh(f$self)();
}
r
}
lhLRtest = function(lhWrapper = NULL, start = NULL, c = list(do.sann = F, sann.cycles = 1000), ...) {
f = lhGetWrapper(lhWrapper, NULL, c$self, ...); # f$self is likelihood object and absorbs ellipsis parameters
self = f$self;
if (is.null(start)) start = f$parsStart(self);
startNull = if (is.matrix(start))
t(apply(start, 1, function(r)f$alt2null(self, r))) else
f$alt2null(self, start);
e.null = lhMlEstimator(sprintf("%s.%s", lhWrapper, "null"), startNull, c(c, list(self = self)));
start = rbind(start, f$null2alt(self, e.null$par));
e.alt = lhMlEstimator(lhWrapper, start, c(c, list(self = self)));
# <p> calculate degrees of freedom
st = f$parsStart(self);
df = length(st) - length(f$alt2null(self, st));
stat = 2 * (e.alt$value - e.null$value);
list(ll.null = e.null$value, ll.alt = e.alt$value,
test.stat = stat, p = 1 - pchisq(stat, df), df = df, par.null = e.null$par, par.alt = e.alt$par
)
}
#
# lh-functions based on likelihood specification
#
# Example: see dataAnalysis.R in hwe project
# Example: binomial distribution
# lhBin = function(p, k, N)dbinom(k, N, p)
# spec_lhBin = list(
# ll = "lhBin",
# alt = list(
# start = c(.5), # also specifies number of parameters
# pars = list(list(name = "rho", type = "freq"))
# ),
# null = list(
# start = c(.5), # assume same likelihood and therefore #pars from alternative
# parsFree = 0 # alternative: list free parameters or specify tail from alt
# )
# );
# r = lhMl(spec_lhBin)
# define a function from an expression and an argument list
toF = function(expr, args, env = parent.frame()) {
as.function(c(args, expr), env)
}
logitI = expit = function(x, min = 0, max = 1) { (max - min)/(1 + exp(-x)) + min }
expitD = toF(D(expression((max - min)/(1 + exp(-x)) + min), 'x'), list(x = NULL, min = 0, max = 1));
logit = function(x, min = 0, max = 1) { log((x - min)/(max - x)) }
logitD = toF(D(expression(log((x - min)/(max - x))), 'x'), list(x = NULL, min = 0, max = 1));
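# Example: expitD/logitD are the symbolic derivatives produced via toF/D
# expitD(0)   # == 0.25, slope of the logistic at the origin
# logitD(.5)  # == 4, reciprocal of the above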
# templates assuming X as argument, p as parameter description list
lhArgMappers = list(
freq = "expit(X)",
int = "expit(X, min, max)",
real = "X",
positive = "log1p(exp(X))"
);
lhArgMappersD = list(
freq = NULL, #D(expression(expit(x), 'x')),
int = "expit(X, min, max)",
real = "X",
positive = "log1p(exp(X))"
);
lhArgMappersI = list(
freq = "logit(X)",
int = "logit(X, min, max)",
real = "X",
positive = "log(expm1(X))"
);
lhSpecificationDefaults = list(
# embed null-parameter into alt-parameter space: variables: npars, parsFree, s (specification),
# p: input parameters
# <i>: optimization: substitute literals from start
default = list(mapper = 'c(c(ARGS_FREE), c(ARGS_BOUND))', mapperInv = 'c(ARGS_FREE)')
);
# richest: richest parametrization of the likelihood
# lhInterface: call the likelihood function with a parameter vector ('vector') or with the
# parameters passed as separate arguments ('inline')
lhSpecificationDefault = list(richest = 'alt', lhInterface = 'vector');
lhSpecificationInterfaces = list(
vector = 'function(p, ...) { pm = mapper(p); if (any(abs(pm) > 1e10)) return(-Inf); lf(pm, ...) }',
inline = 'function(p, ...) { pm = mapper(p); if (any(abs(pm) > 1e10)) return(-Inf); lf(ARGS_INLINE, ...) }'
);
#
# <p> logit derivatives
# simulations in 2014-07-Borstkanker/src/borstKankerExp.R
logExpit1 = function(x)log(expit(x))
logExpit = logExpit2 = function(x)-log1p(exp(-x))
logitExp1 = function(x)logit(exp(x))
logitExp = logitExp2 = function(x)-log(expm1(-x))
logExpit1m1 = function(x)log(1 - expit(x))
logExpit1m = logExpit1m2 = function(x)-log1p(exp(x))
logit1mExp1 = function(x)logit(1 - exp(x))
logit1mExp = logit1mExp2 = function(x)log(expm1(-x))
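# Example: the log1p/expm1-based forms keep precision where the naive compositions round off
# logExpit1(40)  # == 0, since expit(40) rounds to 1
# logExpit(40)   # ~= -4.25e-18, exact to machine precision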
#
# <p> helper functions
#
# mappers for individual parameters
# ps: list of parameters
# mappers: mapper templates to use
# target: name of variable on which to apply
# idcs: indices to iterate
lhMapperPars = function(ps, mappers, target = 'p', idcs = 1:length(ps)) {
maps = if (length(idcs) == 0) c() else sapply(idcs, function(i) {
p = ps[[i]];
a = gsub("X", sprintf("%s[%s]", target,
deparse(if (length(p$entries)) p$entries else i)), mappers[[p$type]]);
a = mergeDictToString(ps[[i]]$args, a);
a
});
r = paste(maps, collapse = ", ");
r
}
# <!> automatic inverse mapping has to heed the time of application of mapperPost
# mappers map individual arguments; mapper subsequently maps the whole vector
lhMapperFunction = function(s, mappers, mapper) {
free = 1:s$parsFree; # idcs of free variables
bound = if(s$parsFree < s$npars) (s$parsFree + 1):s$npars else c(); # idcs of bound variables
mStr = sprintf('function(p){%s}',
mergeDictToString(list(
ARGS_FREE = lhMapperPars(s$pars, mappers, 'p', free),
ARGS_BOUND = lhMapperPars(s$pars, mappers, 'start', bound)
), mapper));
mf = with(s, eval(parse(text = mStr)));
mf
}
lhMapperFunctions = function(s) {
r = list(
mapper = lhMapperFunction(s, lhArgMappers, s$mapper),
mapperInv = lhMapperFunction(s, lhArgMappersI, s$mapperInv)
);
r
}
#' Build wrapper function around likelihood
#'
#' @param template parameter specification used as template (usually the richest parametrization,
#' to be reduced for other hypotheses)
lhPreparePars = function(pars, defaults = lhSpecificationDefaults$default, spec = lhSpecificationDefault,
template = pars) {
# <p> determine free parameters
t = merge.lists(defaults, pars);
npars = length(template$pars);
if (!is.null(t$parsFree)) {
t$pars = if(t$parsFree == 0) list() else template$pars[(npars - t$parsFree + 1):npars];
}
if (is.null(t$start)) t$start = template$start;
if (is.null(t$parsFree)) t$parsFree = length(t$pars);
# <p> construct mapped likelihood function
fs = mergeDictToString(
list(ARGS_INLINE =
paste(sapply(1:npars, function(i) { sprintf("pm[%s]",
deparse(if (length(template$pars[[i]]$entries)) template$pars[[i]]$entries else i)) }
), collapse = ', ')),
lhSpecificationInterfaces[[spec$lhInterface]]
);
t = merge.lists(t, list(npars = npars));
t = merge.lists(t, lhMapperFunctions(t), list(lf = get(spec$ll)));
f = with(t, eval(parse(text = fs)));
t = merge.lists(t, list(npars = npars, lh = f));
t
}
# types: names of specifications for which to define wrapped functions
# richest: name of specification for model that includes a superset of parameters of all other types
lhPrepare = function(s, types = c('null', 'alt')) {
# <p> preparation
s = merge.lists(lhSpecificationDefault, s);
ri = s[[s$richest]];
# number of parameter groups
npars = length(ri$pars);
# number of parameters of the likelihood function
#Npar = sum(list.kp(ri$pars, 'entries', template = 1));
# <p> build wrappers
m = nlapply(types, function(type) {
defaults = merge.lists(lhSpecificationDefaults$default, lhSpecificationDefaults[[type]]);
lhPreparePars(s[[type]], defaults, s, template = ri)
});
m = merge.lists(s, m);
m
}
# <N> free parameters come first
lhFreePars = function(s, p)with(s, {
r = if (parsFree > 0) {
idcs = unlist(list.kp(s$pars[1:parsFree], 'entries'));
if (length(idcs) == 0) idcs = 1:parsFree;
p[idcs]
} else c();
r
})
# second numeric derivative of x
Dn2f = function(f, x, ..., eps = 1e-5) {
(f(x + 2*eps, ...) + f(x - 2*eps, ...) - 2*f(x, ...))/(4*eps^2)
}
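# Example: central second difference with step 2*eps
# Dn2f(exp, 1)     # ~= exp(1)
# Dn2f(sin, pi/2)  # ~= -1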
lhEstMLRaw = function(t, start = NULL, ..., optim_method = 'BFGS') {
if (is.null(start)) start = t$start;
for (method in optim_method) {
o = try(OptimizeMultiStart(t$mapperInv(start), t$lh, method = method, ...));
if (!('try-error' %in% class(o))) break();
}
o$par = t$mapper(o$par);
o$ci$ciL = t$mapper(o$ci$ciL);
o$ci$ciU = t$mapper(o$ci$ciU);
o
}
lhEstML = lhMl = function(s, start = NULL, type = 'alt', ..., optim_method = 'BFGS') {
# <p> mapping of parameters
s = lhPrepare(s, types = type);
lhEstMLRaw(s[[type]], start = start, ..., optim_method = optim_method)
}
lfPrepare = function(s, ...) {
lhParsOrig = list(...);
prepare = sprintf('%s%s', s$ll, c('prepare', '_prepare'));
prepareExists = min(which(sapply(prepare, exists)));
lhPars = if (prepareExists < Inf) get(prepare[prepareExists])(...) else lhParsOrig;
lhPars
}
# specification based LR-test
lhTestLR = function(s, startNull = NULL, startAlt = NULL, types = c('null', 'alt'), ...,
optim_method = 'BFGS', addTypeArg = F) {
# <p> general preparation
s = lhPrepare(s, types = types);
null = s[[types[1]]];
alt = s[[types[2]]];
# <p> specific preparation (user defined)
lhPars = lfPrepare(s, ...);
# <p> null hypothesis
if (is.null(startNull))
startNull = if(null$parsFree == 0) NULL else matrix(lhFreePars(null, null$start), nrow = 1);
lhEstMLRawArgs = c(list(t = null, start = startNull), lhPars, list(optim_method = optim_method));
if (addTypeArg) lhEstMLRawArgs = c(lhEstMLRawArgs, list(lh_type__ = 'null'));
o0 = do.call(lhEstMLRaw, lhEstMLRawArgs);
# <p> alternative hypothesis
if (is.null(startAlt)) {
# build from fit under the null
parNull = lhFreePars(null, o0$par);
startAlt = matrix(c(parNull, alt$start[(length(parNull) + 1):length(alt$start)]), nrow = 1);
}
lhEstMLRawArgs = c(list(t = alt, start = startAlt), lhPars, list(optim_method = optim_method));
if (addTypeArg) lhEstMLRawArgs = c(lhEstMLRawArgs, list(lh_type__ = 'alt'));
o1 = do.call(lhEstMLRaw, lhEstMLRawArgs);
# <p> calculate degrees of freedom
df = length(alt$start) - length(lhFreePars(null, o0$par));
stat = 2 * (o1$value - o0$value);
r = list(ll.null = o0$value, ll.alt = o1$value,
test.stat = stat, p = 1 - pchisq(stat, df), df = df, par.null = o0$par, par.alt = o1$par,
lh.pars = lhPars, lh.pars.orig = list(...) # <!> lhParsOrig was local to lfPrepare
);
r
}
#
# <p> latest iteration of LH wrapper
#
lhPrepareFormula = function(s, type, formula, data, ...) {
# <o> compute on subset of data <N> cave: missingness
X = model.matrix(model.frame(formula, data = data), data = data);
# <p> expand parameters
t = s[[type]];
ps = t$pars;
fparsI = which(list.key(ps, 'name') == 'formula');
fpars = ps[[fparsI]]; # formula pars
ps[[fparsI]] = merge.lists(ps[[fparsI]], list(name = 'beta', count = ncol(X)));
# <p> determine slots
counts = cumsum(list.key(ps, 'count'));
countsStart = pop(c(1, counts + 1));
ps = lapply(seq_along(ps), function(i)merge.lists(ps[[i]], list(entries = countsStart[i]:counts[i])));
# <p> determine start
start = avu(sapply(ps, function(p)rep(p$start, p$count)));
# <p> map pars
t$pars = ps;
t = lhPreparePars(t, spec = merge.lists(lhSpecificationDefault, s));
t$start = start;
t
}
lhMlFormula = function(s, formula, data, type = 'formula', ..., optim_method = 'BFGS') {
# <p> mapping of parameters
t = lhPrepareFormula(s, type, formula, data, ...);
# <p> extra args
lhPars = lfPrepare(s, formula = formula, data = data, ...);
# <p> call optimizer
lhEstMLRawArgs = c(list(t = t, start = s$start), lhPars, list(optim_method = optim_method));
r = try(do.call(lhEstMLRaw, lhEstMLRawArgs), silent = T);
print(r);
if (inherits(r, 'try-error')) r = list(par = rep(NA, length(t$start)), value = NA, convergence = 1);
r
}
#
# <p> model manipulation
#
response.is.binary = function(r) {
vs = sort(unique(r));
if (length(vs) != 2) F else all(vs == c(0, 1));
}
#
# <p> clustered data
#
#
# <p> describe relationships (genetic) given a relational (database) model
#
# given relatedness in a data frame of ids and clusterIds, return a list of clusters containing ids
# clusterRelation2list_old = function(r, idName = "id", idClusterName = "idFam", byIndex = T) {
# r = r[, c(idName, idClusterName)];
# ns = sort(unique(r[, 2]));
# # <p> build clusters
# clusters = sapply(ns, function(e)list()); # holds members of clusters
# names(clusters) = ns;
# # <!> we can iterate the list, given it is ordered lexicographically
# for (i in 1:(dim(r)[1])) {
# clN = as.character(r[i, 2]);
# clusters[[clN]] = unlist(c(clusters[[clN]], ifelse(byIndex, i, as.character(r[i, 1]))));
# }
# clusters
# }
clusterRelation2list = function(r, idName = "id", idClusterName = "idFam", byIndex = T) {
r = r[, c(idName, idClusterName)];
clusters = nlapply(sort(unique(r[[idClusterName]])), function(n) {
idcs = which(r[[idClusterName]] == n);
c = if (byIndex) idcs else r[[idName]][idcs];
c
});
clusters
}
# permute clusters of identical size and within clusters
# cluster specification as given by clusterRelation2list assuming byIndex = T
# returned permutation is relative to refIds
permuteClusters = function(cls, refIds = NULL, selectIds = NULL) {
# allow to filter ids from cluster specification
if (!is.null(selectIds)) {
cls = lapply(cls, function(cl)intersect(cl, selectIds));
cls = cls[sapply(cls, length) > 0];
}
cSizes = sapply(cls, function(e)length(e));
# which cluster sizes are present in the data set?
sizes = unique(cSizes);
# indexable list of ids
if (is.null(refIds)) refIds = sort(unlist(cls));
# final permutation of refIds, such that refIds[perm] gives new order
perm = 1:length(refIds);
for (s in sort(sizes, decreasing = T)) { # permute cluster of same size, permute within cluster
clsS = which(cSizes == s);
p1 = sample(1:length(clsS)); # permute clusters
for (i in 1:length(clsS)) {
p2 = sample(1:s);
# <p> indices that are to be replaced
indT = which.indeces(cls[[clsS[i]]], refIds);
# <p> indices where the replacement comes from
indF = which.indeces(cls[[clsS[p1[i]]]][p2], refIds);
# <p> save partial permutation
perm[indT] = indF;
}
}
perm
}
# clusters is a vector with cluster ids
clustersPermute = function(cls) {
permuteClusters(clusterRelation2list(data.frame(id = 1:length(cls), idFam = cls)))
}
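# Example (sketch, relies on the nlapply/which.indeces utility helpers):
# permute individuals while keeping the cluster structure intact
# cls = c(1, 1, 2, 2, 3, 3);    # three clusters of size two
# perm = clustersPermute(cls);  # use as data[perm, ] to exchange whole clusters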
#
# <p> wrap model fitting for lm/glm/gee fitters
#
#library("geepack"); # <i> move to init method
regressionMethods = list(
# assume formula to contain random effect
glmr = list(
fit = function(formula, data, clusterCol = NULL, ...) {
glmer(formula, data = data, ...)
},
compare = function(m1, m0){
a = anova(m0$r, m1$r, test = "Chisq");
list(anova = a, m0 = m0, m1 = m1,
#p.value = a[["P(>|Chi|)"]][2],
p.value = a[['Pr(>Chisq)']][2], # as of R 2.15.1
effects0 = coefficients(summary(m0$r))[, "Estimate"],
sdevs0 = coefficients(summary(m0$r))[, "Std. Error"],
effects1 = coefficients(summary(m1$r))[, "Estimate"],
sdevs1 = coefficients(summary(m1$r))[, "Std. Error"]
)
}
),
# use cluster column <!> untested
glmrcl = list(
fit = function(formula, data, clusterCol = NULL, ...) {
f = update(formula, as.formula(Sprintf('~ . + (1|%{clusterCol}s)')));
glmer(f, data = data, ...)
},
compare = function(m1, m0){
a = anova(m0$r, m1$r, test = "Chisq");
list(anova = a, m0 = m0, m1 = m1,
#p.value = a[["P(>|Chi|)"]][2],
p.value = a[['Pr(>Chisq)']][2], # as of R 2.15.1
effects0 = coefficients(summary(m0$r))[, "Estimate"],
sdevs0 = coefficients(summary(m0$r))[, "Std. Error"],
effects1 = coefficients(summary(m1$r))[, "Estimate"],
sdevs1 = coefficients(summary(m1$r))[, "Std. Error"]
)
}
),
glm = list(
fit = function(formula, data, clusterCol = NULL, ...)glm(formula, data = data, ...),
compare = function(m1, m0){
a = anova(m0$r, m1$r, test = "Chisq");
list(anova = a, m0 = m0, m1 = m1,
#p.value = a[["P(>|Chi|)"]][2],
p.value = a[['Pr(>Chi)']][2], # as of R 2.15.1
effects0 = coefficients(summary(m0$r))[, "Estimate"],
sdevs0 = coefficients(summary(m0$r))[, "Std. Error"],
effects1 = coefficients(summary(m1$r))[, "Estimate"],
sdevs1 = coefficients(summary(m1$r))[, "Std. Error"]
)
}
),
lm = list(
fit = function(formula, data, clusterCol = NULL, ...)lm(formula, data = data, ...),
compare = function(m1, m0){
a = anova(m0$r, m1$r);
list(anova = a, m0 = m0, m1 = m1, p.value = a[["Pr(>F)"]][2],
effects0 = coefficients(summary(m0$r))[, "Estimate"],
sdevs0 = coefficients(summary(m0$r))[, "Std. Error"],
effects1 = coefficients(summary(m1$r))[, "Estimate"],
sdevs1 = coefficients(summary(m1$r))[, "Std. Error"]
)
}
),
gee = list(
fit = function(formula, data, clusterCol, family = binomial(), ...) {
if (!length(formula.covariates(formula))) return(NULL);
# geeglm needs ordered clusterIds <!> <N> now pre-processed
#data = data[order(data[[clusterCol]]), , drop = F];
#data = Df_(data, headerMap = listKeyValue(clusterCol, 'regrCompClusters'));
#names(data)[which(names(data) == clusterCol)] = "regrCompClusters"; # hack to make geeglm work
#geeClusters = as.integer(data[[clusterCol]]);
myFamily = get(family$family)(); # re-instantiate the family object in the local environment
r = geeglm(formula, data = data, id = regrCompClusters, family = myFamily, ...);
r
},
compare = function(m1, m0){
#a = if (is.null(m0)) anova(m1$r) else anova.geeglm(m0$r, m1$r);
a = if (is.null(m0)) anova(m1$r) else { anova(m0$r, m1$r); }
list(anova = a, m0 = m0, m1 = m1, p.value = a[["P(>|Chi|)"]][1],
effects0 = coefficients(summary(m0$r))[, "Estimate"],
sdevs0 = coefficients(summary(m0$r))[, "Std.err"],
effects1 = coefficients(summary(m1$r))[, "Estimate"],
sdevs1 = coefficients(summary(m1$r))[, "Std.err"]
)
}
),
geebin = list(
fit = function(formula, data, clusterCol, ...) {
if (!length(formula.covariates(formula))) return(NULL);
# <!> workaround for generic approach that gives obscure error message
# gee with family = binomial()
r = geeglm(formula, data = data, id = regrCompClusters, family = binomial(), ...);
r
},
compare = function(m1, m0){
#a = if (is.null(m0)) anova(m1$r) else anova.geeglm(m0$r, m1$r);
a = if (is.null(m0)) anova(m1$r) else { anova(m0$r, m1$r); }
list(anova = a, m0 = m0, m1 = m1, p.value = a[["P(>|Chi|)"]][1],
effects0 = coefficients(summary(m0$r))[, "Estimate"],
sdevs0 = coefficients(summary(m0$r))[, "Std.err"],
effects1 = coefficients(summary(m1$r))[, "Estimate"],
sdevs1 = coefficients(summary(m1$r))[, "Std.err"]
)
}
),
surv = list(
fit = function(formula, data, clusterCol, ...)
list(r = coxph(formula, data = data), formula = formula, data = data),
compare = function(m1, m0){
vars0 = formula.predictors(m0$r$formula, m0$r$data);
Nvars0 = length(vars0);
vars1 = formula.predictors(m1$r$formula, m1$r$data);
Nvars1 = length(vars1);
p.value = if (Nvars0 == 0 || (Nvars1 - Nvars0) == 1) {
v = setdiff(vars1, vars0);
summary(m1$r$r)$coefficients[v, 'Pr(>|z|)']
} else {
anova(m0$r$r, m1$r$r)[2, 'P(>|Chi|)']
}
r = list(
p.value = p.value,
effects0 = coefficients(summary(m0$r$r)),
effects1 = coefficients(summary(m1$r$r))
);
r
}
)
);
CompleteRows = function(f1, data) {
vars = if (is.character(f1)) f1 else all.vars(as.formula(f1));
rows = apply(data[, vars, drop = F], 1, function(r)all(!is.na(r)));
r = which(rows);
r
}
# <!> clusterIds is needed as argument although just forwarded
regressionFit = function(f, data, type, ...) {
r = regressionMethods[[type]]$fit(f, data, ...);
list(type = type, r = r)
}
regressionCompare = function(m1, m0) {
r = regressionMethods[[m1$type]]$compare(m1, m0);
r
}
regressionCompareModelsRaw = function(f1, f0, data, type = "lm", clusterCol = NULL, ...) {
f0 = as.formula(f0);
f1 = as.formula(f1);
# <p> jointly trim data according to missing data
#rows = which(apply(data[, c(formula.vars(f1), clusterCol)], 1, function(r)all(!is.na(r))));
# more robust version
row.names(data) = NULL;
#rows = as.integer(row.names(model.frame(f1, data = data)));
# robust for random effects
#rows = apply(data[, all.vars(as.formula(f1)), drop = F], 1, function(r)!any(is.na(r)));
#d0 = data[rows, ];
# <p> general cluster processing: ordering and renaming of the cluster column (<!> both required for geeglm)
if (Nif(clusterCol)) {
data = data[order(data[[clusterCol]]), , drop = F];
data = Df_(data, headerMap = listKeyValue(clusterCol, 'regrCompClusters'));
}
d0 = model_matrix_from_formula(f1, data, returnComplete = T)$data;
# <p> fit and compare models
m1 = regressionFit(f1, data = d0, type = type, clusterCol = clusterCol, ...);
m0 = regressionFit(f0, data = d0, type = type, clusterCol = clusterCol, ...);
a = regressionCompare(m1, m0);
a
}
permuteDefault = list(
p.value = 0, sdev.rel = .3, Nchunk = 1e3,
nuisanceCovariates = NULL, .clRunLocal = T
);
# idCol: used for permutation: column specifying the identity of individuals; could be filled automatically <i>
# permute: list of permutation settings (see permuteDefault)
# sdev.rel: sdev relative to p.value to decide how often to permute
regressionCompareModels = function(f1, f0, data, type = "lm", clusterCol = NULL, ...,
permute = permuteDefault) {
permute = merge.lists(permuteDefault, permute);
r = regressionCompareModelsRaw(f1, f0, data, type, clusterCol, ...);
if (!is.null(r) && !is.null(r$p.value) && !is.na(r$p.value) && r$p.value < permute$p.value)
r = regressionCompareModelsPermuted(f1, f0, data, type, clusterCol, ..., permute = permute);
r
}
#
# <p> permuted cluster regression
#
regressionCompareModelsPermuted = function(f1, f0, data, type = "lm", clusterCol = "cluster", ...,
idCol = "id", permute = permuteDefault, Nmax = 1e5) {
# <p> data p-value
a.data = regressionCompareModelsRaw(f1, f0, data, type, clusterCol = clusterCol, ...);
p.data = a.data$p.value;
# <p> logging
Log(sprintf("Permuting Regression: %s [p = %.2e]", paste(as.character(f1), collapse = " "), p.data), 4);
# <p> indices of variables to permute
pvs = setdiff(formula.covariates(f1), permute$nuisanceCovariates);
# <p> precompute cluster data structure
cls = clusterRelation2list(data.frame(id = 1:length(data[[clusterCol]]), idFam = data[[clusterCol]]))
ps = NULL;
d0 = data;
# adaptive permutation
repeat {
ps0 = clapply(1:permute$Nchunk, function(i, f1, f0, data, type, clusterCol, cls, pvs){
d0[, pvs] = if (is.null(clusterCol)) d0[sample(1:(dim(data)[1])), pvs] else
d0[permuteClusters(cls), pvs];
r = regressionCompareModelsRaw(f1, f0, d0, type, clusterCol, ...);
r$p.value
}, f1 = f1, f0 = f0, data = data, type = type, clusterCol = clusterCol, cls = cls, pvs = pvs,
.clRunLocal = permute$.clRunLocal);
ps0 = na.exclude(as.numeric(ps0));
ps = c(ps, ps0);
#print(ps[1:100]);
p.emp = fraction(ps <= p.data);
# <p> stopping criterion
p.break = if (p.emp == 0) 1 / length(ps) else p.emp;
sdev.rel = sqrt(p.break * (1 - p.break) / length(ps)) / p.break;
#print(list(sd = sdev.rel * p.break, sd.rel = sdev.rel, p = p.emp));
if (sdev.rel <= permute$sdev.rel) break;
# <p> final stop
if (length(ps) >= Nmax) break;
};
r = list(f1 = f1, f0 = f0, p.value = p.emp, p.data = p.data, anova = a.data$anova, ps = ps);
r
}
# permute covariates in order to obtain empirical p-values
# f1: model formula alternative
# f0: model formula hypothesis
# M: number of permutations
regressionCompareModelsEmp = function(f1, f0, data, nuisanceCovariates = c(), type = "lm", M = 1e3, ...,
idName = "id", idClusterName = "cluster", .clRunLocal = T) {
r = regressionCompareModelsPermuted(f1, f0, data, type, ..., clusterCol = idClusterName, idCol = idName,
permute = list(Nchunk = M, nuisanceCovariates = nuisanceCovariates, .clRunLocal = .clRunLocal));
r
}
# data: data.frame
# stat: function computing test statistic
# vars: formula specifying the variables to permute
# Nperm: number of permutations
# Pvalue: c('upper', 'lower', 'two.tailed')
permute = function(data, stat, vars, ..., Nperm = 5e3, Pvalue = 'lower', na.rm = T, fracBadStatThres = .01,
returnT = TRUE) {
perm.vars = all.vars(as.formula(vars));
Ts = Sapply(0:Nperm, function(i, data, ...) {
if (i > 0) data[, perm.vars] = data[sample(nrow(data)), perm.vars];
stat(data, ...)
}, data = data, ...);
fracBadStatistics = mean(is.na(Ts[-1]));
if (is.na(Ts[1]) || fracBadStatistics >= fracBadStatThres) return(list(p.value = NA));
Ts = Ts[!is.na(Ts)];
Tdata = Ts[1];
Ts = Ts[-1];
Plower = (1 + sum(Ts <= Tdata)) / Nperm;
Pupper = (1 + sum(Ts >= Tdata)) / Nperm;
p.value = switch(Pvalue,
lower = Plower,
upper = Pupper,
two.tailed = 2 * min(Plower, Pupper)
);
r = if (returnT)
list(p.value = p.value, t.data = Tdata, t.perm = Ts) else
list(p.value = p.value, t.data = Tdata);
r
}
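# Example (sketch, assuming the Sapply helper from the utility library):
# permutation p-value for a correlation statistic, shuffling x against y
# d = data.frame(x = rnorm(100), y = rnorm(100));
# permute(d, stat = function(data)cor(data$x, data$y), vars = ~ x, Nperm = 1e3, Pvalue = 'two.tailed')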
calibrate = function(f1, data, regression = glm, family = binomial(), quantiles = seq(0, 1, 0.25), ...) {
stat0 = regression(f1, data, family = family);
scores = predict(stat0, data, type = 'link');
ps = predict(stat0, data, type = 'response');
y = data[[formula.response(f1)]];
groups = quantile(scores, probs = quantiles, na.rm = FALSE);
calibration = lapply(2:length(quantiles), function(i) {
group = scores > groups[i - 1] & scores <= groups[i];
c(mean(y[group]), mean(ps[group]), sum(group))
});
c0 = Df(do.call(rbind, calibration),
row.names = paste0(sprintf('%.1f', quantiles[-1] * 100), '%'),
names = c('truth', 'prediction', 'N'));
r = list(calibration = c0, model = summary(stat0), quantiles = groups[-1], scores = scores);
r
}
#
# <p> P-values
#
alphaLevels = function(ps, Nalpha = 40, alphas = seq(1/Nalpha, 1 - 1/Nalpha, length.out = Nalpha)) {
lvls = setNames(sapply(alphas, function(alpha)mean(ps < alpha)), round(alphas, 2));
lvls
}
testSizes = function(ps, levels = seq(0, 1, length.out = 21), asDf = FALSE, scale = 1, round = NULL, diff = FALSE) {
sizes = sapply(levels, function(l) fraction(ps <= l));
if (diff) sizes = sizes - levels;
r = if (asDf)
Df(level = levels, size = sizes) else
SetNames(sizes, Sprintf("%{l}.1f%%", l = levels*100));
r = r * scale;
if (notE(round)) r = round(r, round);
return(r);
}
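# Example: uniform p-values yield empirical sizes close to the nominal levels
# testSizes(runif(1e4), levels = c(.01, .05, .1))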
TestSizes = function(ps, levels = seq(0, 1, length.out = 21), asDf = FALSE, scale = 100, round = 2)
testSizes(ps, levels, asDf, scale, round)
testSizePlot = function(ps, ..., diagonal = 'red') {
psP = testSizes(ps, ..., asDf = TRUE);
ggplot(psP, aes(x = level, y = size)) + geom_line() +
geom_abline(intercept = 0, slope = 1, col = diagonal)
}
#
# <p> error propagation
#
# as derived from the RB project and tested therein
errProd = function(x, sdx, y, sdy, covxy = 0) {
sdp = (x * y) * sqrt((sdx/x)^2 + (sdy/y)^2 + 2 * sdx * sdy * covxy);
sdp
}
errFrac = function(x, sdx, y, sdy, covxy = 0) {
sdp = (x / y) * sqrt((sdx/x)^2 + (sdy/y)^2 - 2 * sdx * sdy * covxy);
sdp
}
errSum = function(sdx, cx = 1, sdy = 0, cy = 1, covxy = 0) {
sds = sqrt((cx *sdx)^2 + (cy * sdy)^2 + 2 * cx * cy * covxy);
sds
}
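# Example: for independent summands (covxy = 0) the variances simply add
# errSum(3, sdy = 4)  # == 5 == sqrt(3^2 + 4^2)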
#
# <§> some general statistical transformations
#
# convert confidence interval to standard dev based on a normality assumption
ciToSd_old = function(ci.lo, ci.up, level = .95) {
# upper centered limit
ciU = ci.up - mean(c(ci.lo, ci.up));
span = ci.up - ci.lo;
# corresponding sd
sd = Vectorize(inverse(function(s)qnorm(1 - (1 - level)/2, 0, s), interval = c(0, span * 8)))(ciU);
sd
}
ciToSd = function(ci.lo, ci.up, level = .95) {
if (missing(ci.up)) {
if (is.null(dim(ci.lo))) {
ci.up = ci.lo[2];
ci.lo = ci.lo[1];
} else {
ci.up = ci.lo[, 2];
ci.lo = ci.lo[, 1];
}
}
alpha = 1 - level;
widthStd = qnorm(1 - alpha/2) * 2;
(ci.up - ci.lo)/widthStd
}
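# Example: a symmetric 95% CI with half-width 1.96 corresponds to sd = 1
# ciToSd(-1.96, 1.96)  # ~= 1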
ciToP = function(ci.lo, ci.up, level = .95, one.sided = F, against = 0) {
sd = ciToSd(ci.lo, ci.up, level)
P = peSdToP((ci.lo + ci.up)/2 - against, sd, one.sided);
P
}
# convert point estimate and SD to p-value (assuming normality)
betaSe2P = peSdToP = function(beta, sd, one.sided = F) {
pnorm(-abs(beta), 0, sd, lower.tail = T) * ifelse(one.sided, 1, 2);
}
betaSdToCi = ciFromBetaSdev = function(beta, sdev, level = .95) {
r = list(effect = beta,
lower = qnorm((1 - level)/2, beta, sdev, lower.tail = T),
upper = qnorm((1 - level)/2, beta, sdev, lower.tail = F)
);
r
}
ciFromSummary = function(s, var = TRUE, level = .95) { # var = TRUE selects all coefficients
cs = coefficients(s)[var, , drop = FALSE];
ciFromBetaSdev(cs[, 'Estimate'], cs[, 'Std. Error'], level = level);
}
pFromBetaSd = function(beta, sd, null = 0)pnorm(null, abs(beta), sd)
sdFromBetaP = function(beta, p)Vectorize(inverse(function(s)peSdToP(beta, s), interval = c(0, 10)))(p);
betaPtoCi = function(beta, p) {
sd = sdFromBetaP(beta, p);
ciFromBetaSdev(beta, sd)
}
betaVarToCi = function(beta, var)ciFromBetaSdev(beta, sqrt(var))
coefficientsOR = function(s, level = .95) {
cis = do.call(cbind, ciFromSummary(s, level = level));
# <p> ORs
cisOR = Df_(exp(cis));
cisOR['(Intercept)', ] = NA; # intercept does not have an OR interpretation
names(cisOR) = paste(names(cisOR), 'OR', sep = '');
# <p> result
cfs = coefficients(s);
r = cbind(cfs, cis[, -1], as.matrix(cisOR));
r
}
coefficientsORmodel = function(model, level = .95)coefficientsOR(summary(model), level)
#
# meta analysis
#
# meta analysis row-wise
metaPvalue = function(ps) {
if (!is.matrix(ps)) ps = matrix(ps, nrow = 1);
if (!all(is.numeric(ps))) ps = apply(ps, 1:2, as.numeric);
cs = apply(ps, 1, function(r)sum(-2*log(r)));
psM = pchisq(cs, 2*dim(ps)[2], lower.tail = F);
psM
}
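# Example: Fisher's method, one combined p-value per row of input p-values
# metaPvalue(matrix(c(.01, .04, .5, .5), nrow = 2, byrow = TRUE))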
#
# data imputation
#
Sample = function(x, ...)if (length(x) == 1)x else sample(x, ...);
mi.simple = function(data, n.imp = 20) {
r = lapply(1:n.imp, function(i) {
for (v in names(data)) {
data[is.na(data[, v]), v] = Sample(na.omit(data[, v]), count(is.na(data[, v])));
}
data
})
r
}
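# Example (sketch, assuming the count helper from the utility library):
# five hot-deck completions drawn from the observed values per column
# d = data.frame(x = c(1, NA, 3, 4), y = c(NA, 1, 0, 1));
# imps = mi.simple(d, n.imp = 5);  # list of five completed copies of d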
cross.imputer = function(imputationData, imputationVars = NULL, doExpandFactors = T) {
if (is.null(imputationVars)) imputationVars = names(imputationData);
f = function(data) {
d0 = data;
for (v in imputationVars) { # cross impute from imputationData
d0[is.na(d0[, v]), v] = Sample(na.omit(imputationData[[v]]), count(is.na(d0[, v])));
}
if (doExpandFactors) d0 = dataExpandFactors(d0)[, imputationVars];
d0
};
f
}
imputeMeanVar = function(col) {
mn = mean(col, na.rm = T);
col[is.na(col)] = mn;
col
}
imputeMean = function(data) {
d1 = apply(data, 2, imputeMeanVar);
d1
}
# impute using mice
imputeVariables = function(data, formula = NULL, vars = NULL, Nimp = 10, returnOriginal = FALSE,
imputationColumn = 'Imputation_', returnMice = FALSE) {
require('mice');
# <p> preparation
if (notE(formula)) vars = all.vars(formula.re(formula, data = data));
invalidVars = vars[!sapply(data[, vars, drop = F], function(col)class(col)[1]) %in% c('factor', 'numeric')];
if (length(invalidVars) > 0) {
stop(Sprintf('some columns neither factor nor numeric: %{v}s', v = join(invalidVars, ', ')));
}
s = descriptives(data[, vars]);
# <p> imputation
dataImputed = mice(data[, vars, drop = F], m = Nimp);
dataL = Df_(complete(dataImputed, "long", include = returnOriginal),
min_ = c('.id'), headerMap = list(`.imp` = imputationColumn), as_numeric = imputationColumn);
dataC = DfStack(data, (Nimp + returnOriginal));
dataC[, vars] = Df_(dataL, min_ = imputationColumn);
dataR = Df(dataL[, imputationColumn], dataC, names = imputationColumn);
r = list(data = dataR, summary = s, vars = vars);
if (returnMice) r = c(r, list(mice = dataImputed));
#r = list(data = dataImputed, summary = s, vars = vars);
r
}
missingness = function(d, f0) {
ns = if (missing(f0)) names(d) else
if (inherits(f0, 'formula')) all.vars(f0) else f0;
r = lapply(ns, function(n) {
list(N = sum(is.na(d[[n]])), freq = mean(is.na(d[[n]])))
});
return(SetNames(do.call(rbind, r), rnames = ns));
}
Missingness = function(d, f0) {
dM = Df_(missingness(d, f0), deep_simplify_ = TRUE);
dM$variable = row.names(dM);
dM$perc = Sprintf('%{perc}.1f%%', perc = dM$freq * 100);
dM$freq = Sprintf('%{freq}.3f', dM);
row.names(dM) = NULL;
return(dM[, c('variable', 'N', 'freq', 'perc')]);
}
#
# <p> cross validation
#
# cross validation partitions for classification data
crossValidationPartitionsClassification = function(responses, K = 15, minEls = 3, maxTries = 15) {
N = length(responses);
cats = unique(responses);
for (i in 1:maxTries) {
# random permutation
perm = sample(1:N, N);
# compute partitions
parts = splitListEls(perm, K, returnElements = T);
counts = data.frame.types(
lapply(parts, function(p)table.n(responses[-p], categories = cats)),
names = cats, do.rbind = T
);
doReject = any(apply(counts, 1, function(r)any(r < minEls)));
if (!doReject) break;
}
r = if (!doReject) parts else {
Log("Error: failed to find suitable cross validation partition!");
NULL
}
r
}
# cross validation partitions for clustered data
# return indices into the cluster vector (cluster identities assumed to be given by integers)
# cluster sizes are not yet heeded
crossValidationPartitionsClusters = function(clusters, K = 20) {
N = length(clusters);
# unique cluster ids
cls = unique(clusters);
# random permutation
perm = Sample(cls, length(cls));
# compute partitions
parts = splitListEls(perm, K, returnElements = T);
r = lapply(parts, function(p)which.indeces(p, clusters, match.multi = T));
r
}
#
# <p> optimization
#
nested.search = function(f, ..., key = NULL, parameters = list(p1 = c(0, 10)),
steps = 3, Ngrid = 4, rscale = 1, return.grid = F, par.as.vector = F, .clRunLocal = rget('.clRunLocal')) {
ps = ps0 = parameters;
for (i in 1:steps) {
# <p> create search grid
pars = lapply(ps, function(p)seq(p[1], p[2], length.out = Ngrid));
grid = merge.multi.list(pars);
# <p> apply function
r = clapply(1:dim(grid)[1], function(j, grid, ...) {
args = if (par.as.vector) list(as.vector(grid[j, ]), ...) else c(as.list(grid[j, ]), list(...));
do.call(f, args);
}, grid = grid, ..., .clRunLocal = .clRunLocal);
# <p> search optimum
values = if (is.null(key)) r else list.kp(r, key, do.unlist = T);
opt = which.min(values * rscale);
pt = grid[opt, ]; # optimal point in the grid search
ps = lapply(1:length(ps), function(j){
from = max(pt[j] - (ps[[j]][2] - ps[[j]][1])/Ngrid, ps0[[j]][1]);
to = min(pt[j] + (ps[[j]][2] - ps[[j]][1])/Ngrid, ps0[[j]][2]);
c(from, to)
});
names(ps) = names(ps0);
}
r = if (return.grid) list(value = values[[opt]], par = pt, grid = r) else
list(value = values[[opt]], par = pt, r = r[[opt]]);
r
}
optim.nested.defaults = list(steps = 5, Ngrid = 4, rscale = 1, return.grid = F);
optim.nested = function(par = NULL, f, ..., lower = -Inf, upper = Inf, control = list())
with(merge.lists(optim.nested.defaults, control), {
bounds = cbind(lower, upper);
parameters = lapply(1:nrow(bounds), function(i)bounds[i, ]);
r = nested.search(f, ..., parameters = parameters,
steps = steps, Ngrid = Ngrid, rscale = rscale, return.grid = return.grid, par.as.vector = T);
r
})
#
# <p> correlation in data
#
Df.corr = function(df, eps = 1e-2) {
N = dim(df)[2];
rc = rcorr(df);
pairs = t(sapply(which(abs(rc$r) > (1 - eps)), function(e) {
row = ((e - 1) %/% N) + 1;
col = ((e - 1) %% N) + 1;
r = c(row, col);
r
}));
pairs = pairs[pairs[, 1] < pairs[, 2], ];
clusters = sub.graph(pairs);
remove = unlist(lapply(clusters, function(e)e[-1]));
r = list(clusters = clusters, cols.remove = remove);
r
}
identity = function(e)e
seq.transf = function(from = 0, to = 1, length.out = 1e1, ..., transf = log, transfI = exp, eps = 1e-5) {
s = transfI(seq(from = transf(from + eps), to = transf(to - eps), length.out = length.out, ...));
s
}
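# Example: multiplicatively (log-scale) spaced sequence, useful for penalty grids
# seq.transf(0.01, 1, length.out = 5)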
#
# <p> bug fixes for packages
#
completeRows = function(data)!apply(data, 1, function(r)any(is.na(r)))
completeData = function(f = ~ ., data, collapse = F, varExclude = c()) {
vsRaw = if (is.character(f)) f else all.vars(f);
if (any(vsRaw == '.')) vsRaw = names(data);
vs = setdiff(vsRaw, varExclude);
data[completeRows(data[, vs, drop = F]), if (collapse) vs else 1:ncol(data), drop = F]
}
model_matrix_from_formula = function(f, data, offset = NULL, ignore.case = F, remove.intercept = F,
returnComplete = F, formulaAsIs = F) {
# <p> prepare data matrices
f1 = if (formulaAsIs) f else formula.re(f, data = data, ignore.case = ignore.case);
f1vars = all.vars(f1);
response = formula.response(f1);
responseVars = all.vars(as.formula(con(response, ' ~ 1')));
# <p> complete data
complete = completeRows(data[, f1vars, drop = F]);
data = droplevels(data[complete, , drop = F]);
# <p> response
responseData = if (notE(responseVars)) with(data, Eval(response)) else NULL;
# <p> offset
offset = if (!is.null(offset)) offset[complete] else NULL;
# <p> model matrix
#mm = if (exists('lFormula') && length(unlist(Regex('[+]\\s*[(]', Deparse(f1)))) > 0)
mm = if (exists('lFormula') && length(unlist(Regex('[+]\\s*[(].*?[|].*[)]', Deparse(f1)))) > 0)
lFormula(f1, data)$X else # lme4
model.matrix(f1, model.frame(f1, data = data));
if (remove.intercept) mm = mm[, !(dimnames(mm)[[2]] == '(Intercept)'), drop = F];
# <p> return
r = list(mm = mm, response = responseData, offset = offset, indeces = which(complete),
varsCov = setdiff(f1vars, responseVars), varsResponse = responseVars);
if (returnComplete) r = c(r, list(data = data));
r
}
complete_from_formula = function(f, data, offset = NULL, ignore.case = F, remove.intercept = F) {
model_matrix_from_formula(f, data, offset, ignore.case, remove.intercept)$indeces
}
complete_from_vars = function(vars, data, offset = NULL, ignore.case = F, remove.intercept = F) {
f = as.formula(Sprintf('~ %{vars}s', vars = join(vars, ' + ')));
model_matrix_from_formula(f, data, offset, ignore.case, remove.intercept)$indeces
}
# <!><t> to be tested: refactoring as of 19.2.2019
glmnet_re_mm = function(y, mm, ..., offset = NULL, ignore.case = F, remove.intercept = F,
lambdas = NULL, cv = T, returnGlmnet = F) {
# <p> fit model
r = if (cv) {
r0 = cv.glmnet(x = mm, y = y, lambda = lambdas, ..., offset = offset);
args = c(List(..., min_ = c('foldid', 'nfolds', 'grouped')),
list(x = mm, y = y, lambda = r0$lambda.min, offset = offset));
# list(x = mm, y = y, lambda = (3*r0$lambda.min + r0$lambda.1se)/4, offset = offset));
# list(x = mm, y = y, lambda = (r0$lambda.min), offset = offset));
do.call('glmnet', args);
} else glmnet(x = mm, y = y, lambda = lambdas, ..., offset = offset);
if (returnGlmnet) r = list(glmnet = r);
return(r);
}
glmnet_re = function(f, data, ..., offset = NULL, ignore.case = F, remove.intercept = F,
lambdas = NULL, cv = T, returnGlmnet = F) {
d = model_matrix_from_formula(f, data, offset, ignore.case, remove.intercept);
r = glmnet_re_mm(d$response, d$mm, ..., offset = d$offset,
ignore.case = ignore.case, remove.intercept = remove.intercept, lambdas = lambdas,
cv = cv, returnGlmnet = returnGlmnet);
return(c(r, list(formula = f)));
}
glmnet_re_refit = function(model, data, ..., var_cutoff = 1e-6, intercept = '1', impute = NULL) {
response = formula.response(model$formula);
if (model$df <= 1) return(list());
# <p> scrutinize model
coefs = model$beta;
varsSel = row.names(coefs)[abs(as.vector(coefs)) > var_cutoff];
varsSel = setdiff(varsSel, '(Intercept)');
if (!is.null(impute) && impute == 'mean') {
# <!> use model matrix <i>
d0 = sapply(varsSel, function(var) {
col = data[[var]];
col[is.na(col)] = mean(col, na.rm = T);
col
});
data[, varsSel] = d0;
}
# <p> refit
f = as.formula(sprintf('%s ~ %s', response, paste(c(intercept, varsSel), collapse = ' + ')));
glm1 = glm(f, data = data, ...);
r0 = list(glm = glm1, score = as.vector(predict(glm1, data, type = 'link')))
r0
}
#library('glmnet');
grid.glmnet.raw = function(..., glmnet.f = cv.glmnet, max.tries = 3) {
for (i in 1:max.tries) {
fit = try(glmnet.f(...), silent = T);
if (all(class(fit) != 'try-error')) break();
}
if (any(class(fit) == 'try-error')) stop(fit[1]);
fit
}
grid.glmnet.control = list(steps = 4, Ngrid = 50, from = .01, to = .8, eps = 1e-5,
transf = identity, transfI = identity);
grid.glmnet = function(..., control = grid.glmnet.control)
with (merge.lists(grid.glmnet.control, control), {
# initialize
fit = NULL;
fromO = from;
toO = to;
options(warn = -1);
for (i in 1:steps) {
lambda = seq.transf(from, to, length.out = Ngrid + 1, eps = eps,
transf = transf, transfI = transfI);
fit = grid.glmnet.raw(..., lambda = sort(lambda, decreasing = T));
step = (to - from)/Ngrid;
from = max(fit$lambda.min - step, 0);
to = fit$lambda.min + step;
}
options(warn = 0);
# choose lambdas to contain lambda.min also covering the range between from and to
lambda = c(
seq.transf(fromO, toO, length.out = Ngrid + 1, eps = eps,
transf = transf, transfI = transfI),
fit$lambda.min
);
fit0 = do.call('grid.glmnet.raw', c(list(...), list(lambda = sort(lambda, decreasing = T))));
args = List(..., min_ = c('nfolds', 'grouped'));
fit1 = do.call('grid.glmnet.raw', c(args, list(lambda = fit$lambda.min, glmnet.f = glmnet)));
r = fit0;
r$glmnet.fit = fit1;
r
})
# f: formula, passed through formula.re
# data: data frame
grid.glmnet.re = function(f, data, ..., offset = NULL, control = grid.glmnet.control,
ignore.case = F, remove.intercept = T)
with (merge.lists(grid.glmnet.control, control), {
# <p> prepare data matrices
f1 = formula.re(f, data = data, ignore.case = ignore.case);
f1vars = all.vars(f1);
response = formula.response(f1);
complete = !apply(data[, f1vars], 1, function(r)any(is.na(r)));
d1 = data[complete, ];
if (!is.null(offset)) offset = offset[complete];
mm = model.matrix(f1, model.frame(f1, data = d1));
if (remove.intercept) mm = mm[, !(dimnames(mm)[[2]] == '(Intercept)')];
# <p> fit model
r = grid.glmnet(x = mm, y = d1[[response]], ..., offset = offset, control = control);
r = c(r, list(formula = f1));
r
})
grid_glmnet_re_refit = function(model, data, ..., var_cutoff = 1e-6, intercept = '1', impute = NULL) {
# <p> scrutinize model
coefs = coefficients(model$glmnet.fit);
varsSel = row.names(coefs)[abs(as.vector(coefs)) > var_cutoff];
varsSel = setdiff(varsSel, '(Intercept)');
response = formula.response(model$formula);
if (!is.null(impute) && impute == 'mean') {
# <!> use model matrix <i>
d0 = sapply(varsSel, function(var) {
col = data[[var]];
col[is.na(col)] = mean(col, na.rm = T);
col
});
data[, varsSel] = d0;
}
# <p> refit
f = as.formula(sprintf('%s ~ %s', response, paste(c(intercept, varsSel), collapse = ' + ')));
glm1 = glm(f, data = data, ...);
r0 = list(glm = glm1, score = as.vector(predict(glm1, data, type = 'link')))
r0
}
refitModel = function(model, f1, f0, data, ..., var_cutoff = 1e-6, ignore.case = F, intercept = '0') {
# <p> prepare formula and data set
f1 = formula.re(f1, data = data, ignore.case = ignore.case);
f0 = formula.re(f0, data = data, ignore.case = ignore.case);
f0covs = formula.covariates(f0);
f1vars = all.vars(f1);
response = formula.response(f1);
complete = complete.cases(data[, f1vars]); #!apply(data[, f1vars], 1, function(r)(any(is.na(r))));
d1 = data[complete, ];
# <p> extract data set according to model
coefs = coefficients(model);
varsSel = row.names(coefs)[abs(as.vector(coefs)) > var_cutoff];
varsSel = setdiff(varsSel, '(Intercept)');
varsSel0 = intersect(varsSel, f0covs);
if (!length(varsSel0)) return(
list(coefficients = coefs, anova = NA, r2 = NA, r20 = NA, raw = NA, model1 = NA, model0 = NA)
);
# <p> re-fit glm
f1 = as.formula(sprintf('%s ~ %s', response, paste(c(intercept, varsSel), collapse = ' + ')));
glm1 = glm(f1, data = d1, ...);
f0 = as.formula(sprintf('%s ~ %s', response, paste(c(intercept, varsSel0), collapse = ' + ')));
glm0 = glm(f0, data = d1, ...);
# <p> anova
a = anova(glm0, glm1, test = 'Chisq');
# <p> R^2
mn = mean(d1[[response]]);
#mm = model.matrix(f1, model.frame(f1, data = d1));
pr = as.vector(predict(glm1, d1, type = 'response'));
#r2 = cor((pr - mn), (d1[[response]] - mn));
r2 = cor(pr, d1[[response]]);
pr0 = as.vector(predict(glm0, d1, type = 'response'));
#r20 = cor((pr0 - mn), (d1[[response]] - mn));
r20 = cor(pr0, d1[[response]]);
# <p> raw-model fit
fScore = as.formula(sprintf('y ~ score + %s', paste(c(intercept, varsSel0), collapse = ' + ')));
d2 = data.frame(
d1[, varsSel0], y = d1[[response]], score = as.vector(predict(glm1, d1))
);
if (length(varsSel0)) names(d2)[1:length(varsSel0)] = varsSel0;
raw = glm(fScore, data = d2, ...);
r = list(coefficients = coefs, anova = a, r2 = r2, r20 = r20,
raw = summary(raw), model1 = glm1, model0 = glm0);
r
}
#
# <p> crossvalidation
#
# <!> to be implemented
cv_summary_lm = function(model, pred, data, ...) {
#summary(model)$fstatistic[1]; # leftover draft line; the mean squared error below is returned
r = mean( (pred - data)^2 );
r
}
cv_test_glm = function(model, formula, data, ...) {
response = formula.response(formula);
responseP = predict(model, data, type = 'response');
responseD = data[[response]];
ll = sum(log(ifelse(responseD == 1, responseP, 1 - responseP))); # log-likelihood of held-out binary outcomes
ll
}
cv_train_glmnet = function(f1, data, ...) {
glmnet_re(f1, data, ..., returnGlmnet = T);
}
cv_test_glmnet = function(model, f1, data, ...) {
preds = predict(model$glmnet, model_matrix_from_formula(f1, data)$mm, type = 'response');
# detect logistic model, constrain probabilities
if (notE(model$glmnet$classnames) && length(model$glmnet$classnames) == 2) preds = minimax(preds, 0, 1);
return(preds);
}
cv_train_glmnet_mm = function(data, ...) {
i = FirstDef(which(dimnames(data)[[2]] == 'y'), 1);
glmnet_re_mm(data[, i], data[, -i], ..., returnGlmnet = T);
}
cv_test_glmnet_mm = function(model, data, ...) {
i = FirstDef(which(dimnames(data)[[2]] == 'y'), 1);
preds = predict(model$glmnet, data[, -i], type = 'response');
# detect logistic model, constrain probabilities
if (notE(model$glmnet$classnames) && length(model$glmnet$classnames) == 2) preds = minimax(preds, 0, 1);
return(preds);
}
cv_train_glm = function(f1, data, ..., family = binomial()) {
glm(f1, data = data, family = family)
}
cv_test_glm_predict = function(model, f1, data, ..., type = 'response') {
predict(model, data, type = type)
}
# cv_prepare = function(data, argsFrom...)
# cv_train = function(data, argsFrom...)
# cv_test = function(model, data, argsFrom...)
# @arg cv_fold number of crossvalidation folds; a negative value denotes leave-|cv_fold|-out
# <!> 19.2.2019: removed cv_prepare for lack of functionality
# <i> argument routing to train/test
crossvalidate = function(cv_train, cv_test,
data, cv_fold = 20, cv_repeats = 1, ..., parallel = F, align_order = TRUE) {
if (cv_fold == 0) stop('crossvalidate: cv_fold must be an integer != 0');
if (!parallel) Lapply = lapply;
N = nrow(data);
r = Lapply(1:cv_repeats, function(i, ...) {
perm = Sample(1:N, N);
# compute partitions
fold = if (cv_fold > 0) cv_fold else as.integer(N/-cv_fold);
parts = splitListEls(perm, fold, returnElements = T);
o = order(unlist(parts));
r = Lapply(parts, function(part, cv_train, cv_test, data, cv_repeats, ...) {
d0 = data[-part, , drop = F];
d1 = data[part, , drop = F];
model = cv_train(..., data = d0);
r = cv_test(model = model, ..., data = d1);
gc();
r
}, cv_train = cv_train, cv_test = cv_test,
data = data, cv_repeats = cv_repeats, ...);
# re-establish order
r = if (align_order
&& all(sapply(r, class) %in% c('numeric', 'integer'))
#&& all(sapply(r, length) == 1)) {
&& sum(sapply(r, length)) == nrow(data)) {
unlist(r)[o];
} else if (align_order && all(sapply(r, class) %in% c('data.frame', 'matrix')) &&
sum(sapply(r, nrow)) == nrow(data)) {
#<!> untested
#r = rbindDataFrames(r, colsFromFirstDf = T);
r = do.call(rbind, r);
r[o, ]
} else if (align_order) stop("Crossvalidate: didn't know how to align order.") else r;
gc();
r
}, ...);
r
}
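# Example (sketch, relies on the splitListEls helper): 5-fold CV of a logistic model,
# out-of-fold predictions returned in the original row order
# d = data.frame(y = rbinom(100, 1, .5), x = rnorm(100));
# preds = crossvalidate(cv_train_glm, cv_test_glm_predict, data = d, cv_fold = 5, f1 = y ~ x)[[1]];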
#
# <p> data standardization
#
standardize = function (x, ...)UseMethod("standardize", x);
standardize.numeric = function(v, na.rm = TRUE, orig.mean = FALSE) {
vM = mean(v, na.rm = na.rm);
vC = v - vM;
vS = vC / sd(vC, na.rm = na.rm);
if (orig.mean) vS + vM else vS
}
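# Example: z-standardization, optionally restoring the original mean
# standardize(c(1, 2, 3))                    # mean 0, sd 1
# standardize(c(1, 2, 3), orig.mean = TRUE)  # sd 1, mean restored to 2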
standardize.data.frame = function(df, na.rm = TRUE)apply(df, 2, standardize.numeric, na.rm = na.rm);
standardize.matrix = function(m, na.rm = TRUE)apply(m, 2, standardize.numeric, na.rm = na.rm);
df2z = function(data, vars = names(as.data.frame(data))) {
data = as.data.frame(data);
df = data.frame.types(sapply(vars, function(v) {
(data[[v]] - mean(data[[v]], na.rm = T)) / sd(data[[v]], na.rm = T)
}), do.transpose = F);
i = which.indeces(vars, names(data));
d0 = data.frame(data[, -i], df);
d0
}
lumpFactor = function(factor, minFreq = NULL, minN = 20, levelPrefix = 'l') {
# <p> preparation
f0 = as.factor(factor);
t0 = table(f0);
ls = levels(f0);
N = length(f0);
if (!is.null(minFreq)) minN = as.integer(minFreq * N + 0.5);
# <p> lumping
map = listKeyValue(ls, ls);
for (i in 1:length(t0)) {
t0 = table(factor);
if (all(t0 >= minN) || length(t0) < 2) break;
# combine two smallest groups
t1 = sort(t0);
newLevel = sprintf('%s%d', levelPrefix, i);
factor = as.character(factor);
factor[factor == names(t1)[1] | factor == names(t1)[2]] = newLevel;
map[[names(t1)[1]]] = map[[names(t1)[2]]] = newLevel;
map[[newLevel]] = newLevel;
}
# <p> normalize map
lsNew = as.character(ls);
repeat {
lsNew0 = lsNew;
lsNew = as.character(map[lsNew]);
if (all(lsNew == lsNew0)) break;
}
return(list(map = listKeyValue(ls, lsNew), factor = factor));
}
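# Example: merge the rarest levels until every remaining level holds at least minN observations
# f = c(rep('a', 30), rep('b', 3), rep('c', 2));
# lumpFactor(f, minN = 5)$map  # 'b' and 'c' are lumped into the new level 'l1'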
# lump a variable after checking other variables for non-missingness
lumpVariableOnVariables = function(data, var, vars, postfix = '_lump', minN = 20) {
# lump var separately on the subset where each variable in vars is non-missing
lump = sapply(vars, function(v) {
dvar = data[[var]][!is.na(data[[v]])];
lump = lumpFactor(dvar, minN = minN);
dvarNew = as.character(lump$map[as.factor(data[[var]])]);
dvarNew[dvarNew == 'NULL'] = NA;
as.factor(dvarNew)
});
d = data.frame(lump);
names(d) = paste(var, paste(vars, postfix, sep = ''), sep = '_');
d
}
#
# <p> descriptive
#
compareVectors = function(l) {
sets = names(l);
# marginals
r0 = nlapply(sets, function(n)c(n, length(l[[n]])));
r1 = nlapply(sets, function(n)c(sprintf('%s-unique', n), length(unique(l[[n]]))));
r2 = nlapply(sets, function(n)c(sprintf("%s-NA", n), sum(is.na(l[[n]]))));
modelList = list(A = sets, B = sets);
r3 = iterateModels(modelList, .constraint = function(A, B)(A < B), function(i, A, B) {
r = list(
c(sprintf("%s inter %s", A, B), length(intersect(l[[A]], l[[B]]))),
c(sprintf("%s union %s", A, B), length(union(l[[A]], l[[B]]))),
c(sprintf("%s min %s", A, B), length(setdiff(l[[A]], l[[B]]))),
c(sprintf("%s min %s", B, A), length(setdiff(l[[B]], l[[A]])))
);
r
}, lapply__ = lapply)$results;
r = c(r0, r1, r2, unlist.n(r3, 1));
r = data.frame.types(r, do.rbind = T, names = c('type', 'count'));
r
}
pairs_std.panel.hist <- function(x, ...) {
usr <- par("usr"); on.exit(par(usr)); # restore settings
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "grey", ...)
}
pairs_std.panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
usr <- par("usr"); on.exit(par(usr));
# <p> hack to get column names
nsAll = eval(expression(names(x)), parent.frame());
#nsV = nsAll[c(eval(expression(i), parent.frame()), eval(expression(j), parent.frame()))];
# <!> seems to be order (j, i) in terms of names 16.3.2021
nsV = nsAll[c(eval(expression(j), parent.frame()), eval(expression(i), parent.frame()))];
par(usr = c(0, 1, 0, 1))
c0 = cor.test(x, y);
Sx = summary(x);
Sy = summary(y);
Sums = RbindDfs(list(list2df(Sx), list2df(Sy)), embed = TRUE); # summaries
if (!('NA.s' %in% names(Sums))) Sums = Df(Sums, NA.s = NA);
#sum = apply(cbind(Sy, Sx), 1:2, sprintf, fmt = '%.3f');
sum = apply(Sums, 1:2, sprintf, fmt = '%.3f');
ns = c('Min', '1st Q', 'Median', 'Mean', '3rd Q', 'Max', 'NAs');
#ns = names(summary(x));
txt <- paste0(prefix,
sprintf("Cor: %.2f (%.2f, %.2f)\n", c0$estimate, c0$conf.int[1], c0$conf.int[2]),
sprintf("P: %.2e\n", c0$p.value),
sprintf("Vars: %s %s\n", nsV[1], nsV[2]),
#join(apply(cbind(ns, sum), 1, join, ' '), "\n")
join(apply(cbind(ns, t(sum)), 1, join, ' '), "\n")
)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(.9, 0.5, txt, adj = c(1, .5), cex = cex.cor * 1) # used tb cex.cor * r
}
unique_fraction = function(x) (length(unique(x))/length(x))
guess_is_factor = function(x, na.rm = T, uniqueFraction = .1, Nlevels = 10) {
all(as.integer(x) == x, na.rm = na.rm) &&
unique_fraction(x) < uniqueFraction &&
length(unique(x)) <= Nlevels
}
panel.scatter = function(x, y, col = par("col"), bg = NA, pch = par("pch"),
cex = 1, col.smooth = "red", span = 2/3, iter = 3, blotSize = cm2in(.3), ...)
{
#call_ = match.call(envir = parent.frame(1L));
# it is difficult to get information on the plotted variables; for now a heuristic is used
# to determine factor status
if (guess_is_factor(x) && guess_is_factor(y)) {
pars = par(c('usr', 'xaxt', 'yaxt')); on.exit(par(pars));
t0 = table(x, y);
#plot(t0);
mx = max(as.integer(t0));
d0 = Df(merge.multi.list(dimnames(t0)), w = sqrt(as.integer(t(t0))), as_numeric = c('x', 'y'));
#par(new = TRUE);
#plot(d0[, c('x', 'y')], type = 'n', xaxt='n', yaxt = 'n');
par(new = TRUE);
with(d0,
symbols(x = x, y = y, circles = w, inches = blotSize, ann = F, bg = "steelblue2", fg = NULL,
xaxt='n', yaxt = 'n')
)
#return();
} else {
# <i> call forwarding
panel.smooth(x, y, col, bg, pch, cex, span, iter);
}
}
pairs_std = function(...,
lower.panel = panel.scatter, diag.panel = pairs_std.panel.hist, upper.panel = pairs_std.panel.cor) {
pairs(...,
lower.panel = lower.panel, diag.panel = diag.panel, upper.panel = upper.panel)
}
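# Example (the panel functions use helpers Df, RbindDfs, join, cm2in from this codebase):
#	pairs_std(iris[, 1:4]);	# histograms on the diagonal, correlations/summaries above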
pairs_std_save = function(data, ..., path = NULL) {
if (is.null(path)) path = tempfile(fileext = '.png');
plot_save_eval(pairs_std(data), ..., plot_path = path);
return(path);
}
pairs_wide_list = function(blocks = NULL, NperBlock = 8) {
# <p> names
ns = names(blocks);
nsAll = unlist(blocks);
blkNs = if (notE(ns)) ns else paste('Block', seq_along(blocks));
# <p> block tiling
Iblocks = seq_along(blocks);
blocksPair = merge.multi(Iblocks, Iblocks);
# <p> reorder
Idiag = blocksPair[, 1] == blocksPair[, 2];
blocksPair = rbind(blocksPair[Idiag, ], blocksPair[!Idiag, ]);
# <p> tabnames
pairNs = apply(blocksPair, 1, function(r)paste(unique(c(blkNs[r[1]], blkNs[r[2]])), collapse = ' - '));
# <p> indices
horIdcs = lapply(blocksPair[, 1], function(b)which.indeces(blocks[[b]], nsAll));
verIdcs = lapply(blocksPair[, 2], function(b)which.indeces(blocks[[b]], nsAll));
# <p> by pair of blocks
r = lapply(1:nrow(blocksPair), function(i)list(
coord = blocksPair[i, ], name = pairNs[i],
names = list(blocks[[ blocksPair[i, 1] ]], blocks[[ blocksPair[i, 2] ]]),
horIdcs = horIdcs[[i]], verIdcs = verIdcs[[i]]
));
return(r);
}
pairs_wide_old = function(data, ..., blocks = NULL, NperBlock = 8, consumer = Identity1) {
nsPairs = unlist(blocks);
dPairs = data[, nsPairs];
Iblocks = seq_along(blocks);
blocksPair = merge.multi(Iblocks, Iblocks);
plts = apply(blocksPair, 1, function(r) {
horInd = which.indeces(blocks[[ r[1] ]], nsPairs);
verInd = which.indeces(blocks[[ r[2] ]], nsPairs);
pl = consumer(pairs_std(dPairs, horInd = horInd, verInd = verInd), r);
return(list(pl))
});
return(unlist.n(plts, 1));
}
pairs_wide_single = function(data, b) { pairs_std(data, horInd = b$horIdcs, verInd = b$verIdcs); }
pairs_wide = function(data, ..., blocks = NULL, NperBlock = 8, consumer = Identity1) {
prs = pairs_wide_list(blocks, NperBlock);
dPairs = data[, unlist(blocks)];
plts = lapply(prs, function(b)list(consumer(pairs_wide_single(dPairs, b), b)));
return(unlist.n(plts, 1));
}
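# Example: tile a wide data set into blockwise pairs plots; block names label the plots:
#	blocks = list(B1 = names(data)[1:8], B2 = names(data)[9:16]);
#	plts = pairs_wide(data, blocks = blocks);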
pairs_wide_save = function(data, ..., blocks = NULL, path = NULL, ext = 'jpeg') {
prefix = if (is.null(path)) tempfile() else path;
r = pairs_wide(data, ..., blocks = blocks, consumer = function(plot, b) {
plot_save_eval(plot, plot_path = Sprintf('%{prefix}s-%{name}s.%{ext}', b));
});
return(r);
}
#
# <p> omics data
#
quantileData = function(d, p) {
dS = sort(d);
q = dS[ceiling(p * length(dS))];
q
}
quantileReference = function(reference, direction = 2, center = TRUE) {
if (is.matrix(reference) && center) {
refC = matrixCenter(reference, direction);
reference = matrixDeCenter(refC$matrix, mean(refC$center), direction);
}
ref = na.omit(as.vector(as.matrix(reference)));
ref
}
Skewness = function(x, na.rm = T) {
x0 = if(na.rm) na.omit(x) else x;
N = length(x0);
x1 = x0 - mean(x0)
y = sqrt(N) * sum(x1^3) / (sum(x1^2)^(3/2))
s = y * ((1 - 1/N))^(3/2);
s
}
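# Skewness computes the moment estimator b1 = g1 * ((N - 1)/N)^(3/2), which should agree
# with e1071::skewness(x, type = 3); e.g.:
#	Skewness(c(1, 2, 3, 10))	# > 0: right-skewed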
Noutliers = function(x, coef = 1.5)length(boxplot.stats(x, coef = coef)$out)
#' Quantile normalization of frame/matrix with respect to reference distribution
#'
#' Distributions to be normalized are represented as columns or rows of a matrix/data frame.
#' Each value is replaced by the quantile of the reference distribution at the probability
#' given by the empirical distribution function of that value within its own distribution.
#'
#' @param reference numeric vector (or matrix, see \code{referenceDirection}) with realizations from the target distribution
#' @param data data frame or matrix with data to be normalized
#' @param direction is \code{data} organized per row (1) or per column (2)?
#' @param impute temporarily impute missing values with per-feature medians so ranks are well-defined; imputed positions are reset to NA afterwards
#' @param ties tie-breaking method passed on to \code{rank}
#' @param center center \code{reference} around the grand mean before pooling (see \code{quantileReference})?
#' @param referenceDirection direction of \code{reference}, if it is a matrix
#'
#' @examples
#' d = sapply(1:20, function(i)rnorm(1e4));
#' dNorm = quantileNormalization(as.vector(d), d)
quantileNormalization = function(reference, data, direction = 2,
impute = TRUE, ties = 'random', center = TRUE, referenceDirection = direction) {
ref = quantileReference(reference, referenceDirection, center);
if (impute) mns = apply(data, 3 - direction, median, na.rm = T);
dN = apply(data, direction, function(d) {
d0 = d;
if (impute) d[is.na(d0)] = mns[is.na(d0)];
r = quantile(ref, probs = rank(d, na.last = 'keep', ties = ties)/length(na.omit(d)))
if (impute) r[is.na(d0)] = NA;
r
});
if (direction == 1) dN = t(dN);
dimnames(dN) = dimnames(data);
dN
}
# quantile normalization based on samples picked on the basis of their medians (around the medians)
# Nqn: number of reference samples
quantileNormalizationMedians = function(data, direction = 2, Nqn = 5, impute = TRUE) {
# <p> determine median of medians, corresponding median, IQR
medians = apply(data, direction, median);
mediansO = order(medians);
medianOI = as.integer(length(mediansO)/2 + .5);
medianI = mediansO[medianOI];
refMed = summary(data[, medianI]);
refIQR = refMed[['3rd Qu.']] - refMed[['1st Qu.']];
# <p> reference samples
refL = as.integer(medianOI - Nqn/2 + .5);
refU = refL + Nqn - 1;
refSamples = mediansO[refL:refU];
#print(refSamples)
# <p> standardize reference samples wrt median, IQR
refSampleValues = sapply(refSamples, function(i) {
refI = summary(data[, i]);
refIIQR = refI[['3rd Qu.']] - refI[['1st Qu.']];
E = (data[, i] - refI[['Median']]) * refIQR/refIIQR + refMed[['Median']];
#refIE = summary(E);
#refIEIQR = refIE[['3rd Qu.']] - refIE[['1st Qu.']];
#print(list(refI = refI, refIIQR = refIIQR, refIE = refIE, refIEIQR = refIEIQR));
E
});
eQn = quantileNormalization(refSampleValues, data,
direction = direction, impute = impute, center = FALSE);
eQn
}
dataCentered = function(d, na.rm = T) {
dC = apply(d, 2, function(col)col - mean(col, na.rm = na.rm));
dC
}
#
# <p> distributions
#
qtruncnorm = function(p, mean = 0, sd = 1, lower = -Inf, upper = Inf) {
plow = pnorm(lower, mean, sd);
pupp = pnorm(upper, mean, sd);
qnorm(p * (pupp - plow) + plow, mean, sd = sd)
}
rtruncnorm = function(N, mean = 0, sd = 1, lower = -Inf, upper = Inf) {
qtruncnorm(runif(N), mean, sd = sd, lower, upper)
}
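# inverse-cdf sampling: draws are guaranteed to stay inside the truncation interval
#	r = rtruncnorm(1e4, mean = 0, sd = 1, lower = 0, upper = 2);
#	stopifnot(all(r >= 0 & r <= 2));
#	mean(r)	# ~ .72, mean of N(0, 1) truncated to [0, 2]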
qqDist = function(x, Nqts = 1e2, qdist, ...) {
qtls = (1:Nqts)/(Nqts + 1);
qtlsExp = qdist(qtls, ...);
qtlsObs = quantile(x, qtls);
qq = qplot(qtlsExp, qtlsObs) + theme_bw();
qq
}
qqSim = function(Nsim, dist = 'truncnorm', Nqts = Nsim/10, ...) {
require('ggplot2');
rdist = get(Sprintf('r%{dist}s'));
r = rdist(Nsim, ...);
qdist = get(Sprintf('q%{dist}s'));
qq = qqDist(r, Nqts, qdist, ...);
qq
}
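# Example: QQ-plot of simulated truncated-normal draws against theoretical quantiles:
#	qqSim(1e4, dist = 'truncnorm', lower = 0, upper = 2);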
#
# <p> entropy
#
table.entropy = function(d) {
p = table.freq(d);
p = p[p != 0];
H = - sum(p * log(p));
H
}
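# Example (assuming table.freq returns relative cell frequencies of table(d)):
#	table.entropy(rep(0:1, 50))	# ~ log(2) = 0.693 for a balanced binary variable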
#
# <p> qvalue
#
Qvalue = function(P.value, ...) {
Require('qvalue');
P.valuesNotNA = na.omit(P.value);
qv = qvalue(P.valuesNotNA, ...);
r = qv;
r$qvalue = vector.embed(rep(NA, sum(is.na(P.value))), which(!is.na(P.value)), qv$qvalue);
r
}
#
# <p> math functions
#
Ceiling = function(x, N = 0)(ceiling(x * 10^N) / 10^N)
Floor = function(x, N = 0)(floor(x * 10^N) / 10^N)
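# ceiling/floor at N decimal places, e.g.:
#	Ceiling(3.14159, 2)	# 3.15
#	Floor(3.14159, 2)	# 3.14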
#
# <p> descriptive statistics
#
SummaryColNum = function(col, qtls = c(0, 0.25, .5, .75, 1)) {
qAlways = c(0, .25, .5, .75, 1);
Qtls = sort(unique(c(qAlways, qtls)));
qs = quantile(col, Qtls, na.rm = TRUE);
mn = mean(col, na.rm = T);
na = fraction(is.na(col));
Sd = sd(col, na.rm = T);
median = qs[which(Qtls == .5)];
qs1 = qs[which.indeces(setdiff(Qtls, qAlways), Qtls)]
N = sum(!is.na(col));
vs = c(mn, Sd, N, na, median,
qs[which(Qtls == .25)], qs[which(Qtls == .75)],
qs[which(Qtls == 0)], qs[which(Qtls == 1)], qs1);
r = Df_(t(vs),
names = c('mean', 'sd', 'N', 'NA', 'median', 'Q25', 'Q75', 'min', 'max', names(qs1)),
row.names = attr(col, 'label'));
return(r);
}
SummaryColFactor = function(col) {
na = vector.named(fraction(is.na(col)), 'NA');
na
}
SummaryCatCol = function(col) {
if (class(col) != 'factor') return(NULL);
tc = t2r(table(col))[, 1];
N = sum(tc)
rn = paste(attr(col, 'label'), names(tc), sep = '.');
#rn = names(tc);
ns = c('mean', 'sd', 'N', 'NA');
rTot = Df(NA, NA, N, mean(is.na(col)), row.names = attr(col, 'label'), names = ns);
#rTot = Df(NA, NA, N, mean(is.na(col)), names = ns);
freq = tc / N;
sd = sqrt(freq * (1 - freq) / N);
rLev = Df(freq, sd, tc, NA, row.names = rn, names = ns)
r = rbind(rTot, rLev);
r
}
SummaryCol = function(col, qtls = c(0, 0.25, .5, .75, 1)) {
r = if (class(col) %in% c('numeric', 'integer')) SummaryColNum(col, qtls) else
if (class(col) == 'factor') SummaryCatCol(col) else
stop("Could not summarize column");
r
}
# attach attribute label as column name to all columns
DfLabel = function(d) {
return(Df(nelapply(d, function(n, col) {
attr(col, 'label') = n;
return(col);
})));
}
Summary = function(d) {
r0 = lapply(DfLabel(d), SummaryCol)
r1 = RbindDfs(SetNames(r0, NULL), embed = TRUE);
return(r1);
}
# summary of categorical data
SummaryCat = function(d) {
r0 = lapply(DfLabel(d), SummaryCatCol)
r = do.call(rbind, r0);
row.names(r) = NULL
r
}
SummaryTestCat = function(col, By, B = 1e4) {
catTab = table(Df(col, By));
P = chisq.test(catTab)$p.value;
PnonPar = chisq.test(catTab, simulate.p.value = TRUE, B = B)$p.value;
return(Df(P = P, PnonPar = PnonPar, row.names = attr(col, 'label')));
}
SummaryTestNum = function(col, By) {
#grps = by(col, By, identity); # --> t test
rAnova = summary(aov(y ~ group, Df(y = col, group = By)));
P = rAnova[[1]]['group', 'Pr(>F)'];
PnonPar = kruskal.test(col, By)$p.value;
return(Df(P = P, PnonPar = PnonPar, row.names = attr(col, 'label')));
}
SummaryTest = function(col, by) {
r = if (class(col) %in% c('numeric', 'integer')) SummaryTestNum(col, by) else
if (class(col) == 'factor') SummaryTestCat(col, by) else
stop("Could not summarize column");
r
}
SummaryBy = function(d, By, summaryVars) {
d = DfLabel(d);
if (missing(summaryVars)) summaryVars = setdiff(names(d), By);
r = by(d, d[[By]], function(d0) {
Summary(d0[, summaryVars, drop = F])
});
r = c(list(all = Summary(Df_(d, min_ = By))), r);
tests = lapply(summaryVars, function(n)SummaryTest(d[[n]], d[[By]]));
dfTests = do.call(rbind, tests);
tab = do.call(cbind, r);
tabTests = MergeByRowNames(tab, dfTests, all = TRUE);
return(list(tab = tab, tabsBy = r, tabTests = tabTests, test = dfTests));
}
descriptivesNumeric = function(data) {
bp = apply(data, 2, boxplot.stats);
N = list.kp(bp, 'n', do.unlist = T);
r = data.frame(
N = N,
Nna = nrow(data) - N,
mean = apply(data, 2, mean, na.rm = T),
median = apply(data, 2, median, na.rm = T),
se = apply(data, 2, sd, na.rm = T) / sqrt(N),
Nout = sapply(bp, function(v)length(v$out))
);
r
}
descriptivesFactor = function(data) {
Nnas = apply(data, 2, function(c)sum(is.na(c)));
r = data.frame(
N = nrow(data) - Nnas,
Nna = Nnas,
Ncat = sapply(lapply(data, levels), length)
);
r
}
descriptives = function(data) {
r1 = descriptivesFactor(data[, DfClasses(data) == 'factor']);
r2 = descriptivesNumeric(data[, DfClasses(data) == 'numeric']);
r0 = if (nrow(r2) == 0) list(r1) else if (nrow(r1) == 0) list(r2) else list(r1, r2);
r = listOfDataFrames2data.frame(r0, idColumn = NULL, colsFromUnion = T, row.names = T);
r$`NAperc` = with(r, Nna / nrow(data) * 100);
r
}
#
# <p> survival
#
# adapted from survival:::print.survfit
survfit2m = function (x, scale = 1, ..., rmean = 'none') {
if (inherits(x, "survfitms")) {
x$surv <- 1 - x$prev
if (is.matrix(x$surv)) dimnames(x$surv) <- list(NULL, x$states)
if (!is.null(x$lower)) {
x$lower <- 1 - x$lower
x$upper <- 1 - x$upper
}
rmean <- "none"
}
omit <- x$na.action
temp <- survival:::survmean(x, scale = scale, rmean)
mtemp <- if (is.matrix(temp$matrix))
temp$matrix
else matrix(temp$matrix, nrow = 1, dimnames = list(NULL,
names(temp$matrix)))
if (all(mtemp[, 2] == mtemp[, 3])) {
cname <- dimnames(mtemp)[[2]]
mtemp <- mtemp[, -2, drop = FALSE]
cname <- cname[-2]
cname[2] <- "n"
dimnames(mtemp)[[2]] <- cname
}
if (all(mtemp[, 1] == mtemp[, 2]))
mtemp <- mtemp[, -1, drop = FALSE]
temp$matrix <- drop(mtemp)
temp$matrix
}
#
# <p> mixed models
#
lmer2ci = function(object, method = 'profile')confint.merMod(object, method = method);
lmer2p = function(object, method = 'profile') {
cis = lmer2ci(object, method = method);
apply(cis, 1, function(ci)ciToP(ci[1], ci[2]))
}
polr2p = function(rPoly) {
ci = t2r(confint(rPoly));
cfs = coefficients(summary(rPoly));
cfs1 = cbind(cfs, P = c(ciToP(ci[, 1], ci[, 2]), rep(NA, nrow(cfs) - nrow(ci))));
return(cfs1);
}
# confint plus P-value
confintP = function(m, level = .95) {
ci = confint(m, level = level);
P = apply(ci, 1, function(ci)ciToP(ci[1], ci[2], level = level));
cbind(ci, P)
}
coefficientsLmer = function(object, method = 'Wald') {
cfs = coefficients(summary(object));
P = lmer2p(object, method);
Iinter = which(names(P) == '(Intercept)');
r = cbind(cfs, P = P[Iinter:length(P)]);
return(r);
}
# alternative check for convergence of glmer models
#https://stackoverflow.com/questions/21344555/convergence-error-for-development-version-of-lme4
glmerFit = function(m) {
relgrad = with(m@optinfo$derivs, solve(Hessian, gradient));
return(max(abs(relgrad)));
}
GlmerFit = function(m, cutoff = 1e-3)(glmerFit(m) < cutoff)
#
# <p> simulation studies
#
simulationRun = dataSimulation = function(Niteration, fSim, fStat = function(data, ...)summary(data),
..., parSim = list(), parStat = list()) {
parShared = list(...);
r = Lapply(1:Niteration, function(i) {
data = do.call(fSim, c(parSim, parShared));
do.call(fStat, c(list(data = data), parStat, parShared))
});
return(r);
}
simulationRunBySpec = function(fSim, fStat = function(data, ...)summary(data),
spec = list(Nsim = 1e1, sim = list(), stat = list(), pars = list())) {
args = c(
list(Niteration = spec$Nsim, fSim = fSim, fStat = fStat,
parSim = spec$sim, parStat = spec$stat),
spec$pars
);
r = do.call(simulationRun, args);
return(r);
}
simulationDataBySpec = function(fSim,
spec = list(Nsim = 1e1L, sim = list(), stat = list(), pars = list())) {
data = do.call(fSim, c(spec$pars, spec$sim));
return(data);
}
simulationStatBySpec = function(fTest, data,
spec = list(Nsim = 1e1L, sim = list(), stat = list(), pars = list())) {
r = do.call(fTest, c(list(data = data), spec$pars, spec$stat));
return(r);
}
#
# <p> delta rule
#
#
# Numeric delta rule
#
gradient = function(f, x, ..., eps = 1e-5) {
sapply(seq_along(x), function(i) {
epsI = vector.assign(0, i, eps, N = length(x));
(f(x + epsI, ...) - f(x - epsI, ...)) / (2 * eps)
});
}
totalDeriv = function(f, x, ..., eps = 1e-5) {
y = f(x, ...);
t(sapply(seq_along(y), function(i)gradient(function(x)f(x, ...)[i], x, eps = eps)));
}
deltaNPseudo = function(N, pseudo = 0) {
if (is.list(N)) N = do.call(cbind, N);
if (pseudo != 0) {
N[1, ] = N[1, ] + pseudo;
N[2, ] = N[2, ] + 2*pseudo;
}
return(N);
}
# delta rule for functions of binomial data
#N: list of pairs (Nevents, Ntotal), or 2 x n matrix with events in row 1 and totals in row 2
#f: function to be computed on the frequencies
deltaFreqs = function(N, f, logScale = T, ..., eps = 1e-5, alpha = .95, pseudo = 0) {
#if (is.list(N)) N = sapply(N, identity);
N = deltaNPseudo(N, pseudo);
freqs = N[1, ] / N[2, ];
var = freqs * (1 - freqs) / N[2, ];
fDeriv = if (logScale) function(x, ...)log(f(x, ...)) else f;
fGradient = gradient(fDeriv, freqs, ..., eps = eps);
varf = (t(fGradient) %*% diag(var) %*% fGradient)[1, 1];
limit = qnorm(1 - (1 - alpha)/2, 0, sqrt(varf));
y = fDeriv(freqs, ...);
yCi = y + c(-limit, limit);
r = if (logScale)
list(y = exp(y), ci = exp(yCi), var = varf, se = sqrt(varf), logCi = yCi) else
list(y = y, ci = yCi, var = varf, se = sqrt(varf));
r
}
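# Example: CI for a relative risk from two binomial samples (12/100 vs. 30/100 events);
# f maps the event frequencies to the statistic of interest (gradient() above does the rest):
#	deltaFreqs(list(c(12, 100), c(30, 100)), function(p)(p[1]/p[2]), logScale = T);
#	# $y ~ .4 with a delta-rule CI computed on the log scale and back-transformed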
delta = function(x, Cov, f, logScale = F, ..., eps = 1e-5, alpha = .95) {
fDeriv = if (logScale) function(x, ...)log(f(x, ...)) else f;
fGradient = gradient(fDeriv, x, ..., eps = eps);
varf = (t(fGradient) %*% Cov %*% fGradient)[1, 1];
y = fDeriv(x, ...);
limit = qnorm(1 - (1 - alpha)/2, 0, sqrt(varf));
yCi = y + c(-limit, limit);
r = if (logScale)
list(y = exp(y), ci = exp(yCi), var = varf, se = sqrt(varf), logCi = yCi) else
list(y = y, ci = yCi, var = varf, se = sqrt(varf));
r
}
# return freq-vector, cov matrix
deltaMultinomFromCnts = function(counts) {
N = sum(counts);
freqs = counts / N;
# covariance of estimated frequencies: Var(p_i) = p_i(1 - p_i)/N, Cov(p_i, p_j) = -p_i p_j/N
Var = freqs * (1 - freqs) / N;
Cov = - freqs %*% t(freqs) / N;
diag(Cov) = Var;
return(list(par = freqs, cov = Cov));
}
# compose lists of par/cov structures
deltaCompose = function(parCov) {
return(list(par = list.kpu(parCov, 'par'), cov = .bdiag(list.kp(parCov, 'cov'))));
}
#
# <p> sensitivity/specificity/ROC/AUC
#
# assume TRUE/FALSE
# tab = Table(list(pred = prob >= co, truth = labels), cats = list(pred = c(F, T), truth = c(F, T)))
# Table(list(pred = c(T, F, T, F), truth = c(T, T, F, F)), cats = list(pred = c(F, T), truth = c(F, T)))
binaryMeasure = function(tab, strict = TRUE) {
if (strict && !all(names(dimnames(tab)) == c('pred', 'truth'))) stop('dimnames != pred, truth');
if (strict && !all(unique(unlist(dimnames(tab))) == c('FALSE', 'TRUE'))) stop('values != FALSE, TRUE');
r = list(
tp = sum(tab[, 'TRUE']) / sum(tab),
tn = sum(tab[, 'FALSE']) / sum(tab),
sensitivity = tab['TRUE', 'TRUE'] / sum(tab[, 'TRUE']),
specificity = tab['FALSE', 'FALSE'] / sum(tab[, 'FALSE']),
ppv = tab['TRUE', 'TRUE'] / sum(tab['TRUE', ]),
npv = tab['FALSE', 'FALSE'] / sum(tab['FALSE', ])
);
return(r);
}
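# Example with a base-R table; dimnames must be named pred/truth with FALSE/TRUE levels:
#	tab = table(pred = c(T, F, T, F), truth = c(T, T, F, F));
#	binaryMeasure(tab)	# here sensitivity = specificity = ppv = npv = .5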
binaryMeasures = function(prob, labels) {
cutoffs = sort(unique(prob));
measures = lapply(cutoffs, function(co) {
tab = Table(list(pred = prob >= co, truth = labels), cats = list(pred = c(F, T), truth = c(F, T)));
bm = c(list(cutoff = co), binaryMeasure(tab));
bm
});
r = apply(do.call(rbind, measures), 2, unlist);
r
}
cvROC = function(f1, data,
cv_fold = 10, cv_repeats = 1, cv_train = cv_train_glmnet, cv_test = cv_test_glmnet) {
Library('pROC');
Library('AUC');
# <p> data preparation
mm = model_matrix_from_formula(f1, data);
d0 = data[mm$indeces, ];
d0resp = d0[[formula.response(f1)]];
# <p> crossvalidation
pred = crossvalidate(cv_train, cv_test, data = d0, cv_fold = cv_fold, cv_repeats = cv_repeats, f1 = f1);
# <p> results
# <i><N> only first repeat handled at the moment
roc = AUC::roc(pred[[1]], as.factor(d0resp));
auc = AUC::auc(roc);
rocProc = pROC::roc(as.factor(d0resp), pred[[1]]);
aucProc = pROC::auc(rocProc);
P = cor.test(d0resp, pred[[1]])
r = list(roc = roc, auc = auc, rocProc = rocProc, aucProc = aucProc, aucCi = ci(rocProc), P = P,
data = Df(response = d0resp, prediction = pred[[1]]));
return(r);
}
plotRocs = function(data, interval = 0.2, breaks = seq(0, 1, interval)) {
d0 = do.call(rbind, nlapply(data, function(n) {
d = if (all( c('tpr', 'fpr') %in% names(data[[n]]))) data[[n]] else # roc already computed
with(data[[n]], AUC::roc(prediction, as.factor(response)));
Df(fpr = 1 - d$fpr, tpr = d$tpr, type = n)
}));
p = ggplot(data = d0) +
#geom_line(aes(x = fpr, y = tpr)) +
geom_step(aes(x = fpr, y = tpr, col = type), lwd = 1) +
geom_segment(aes(x = 0, y = 1, xend = 1, yend = 0)) + #diagonal
scale_x_reverse(name = "Specificity", limits = c(1,0), breaks = breaks, expand = c(0.001,0.001)) +
scale_y_continuous(name = "Sensitivity", limits = c(0,1), breaks = breaks, expand = c(0.001, 0.001))+
labs(color = 'ROC type') +
coord_equal() + theme_bw();
p
}
#
# <p> multiple testing
#
tailStrength = function(ps, M = length(ps)) {
N = length(ps);
# standardize p-values to 0
psS = 1 - sort(ps) * (N + 1)/(1:N);
r = - sum(psS[1:M])/M; # return negative as small values are deemed relevant
r
}
#
# <p> power calculations
#
findBinomN = function(pTrue, pLower, level = 0.95, Nmax = 1e3, lower = T) {
Library('binom');
fBinom = if (lower)
function(N)binom.logit(N*pTrue, N, conf.level = level)$lower else
function(N)binom.logit(N*pTrue, N, conf.level = level)$upper
inverse(fBinom, c(1, Nmax))(pLower);
}
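# Example: smallest N such that the lower 95%-CI bound at observed frequency .8 reaches .7
# (relies on the helper inverse() from this codebase):
#	findBinomN(pTrue = .8, pLower = .7)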
#
# <p> pseudo counts
#
# implement pseudo-count, needed as observations c(0, 0, X) cause diverging estimates
mulitnomCoeff = function(x, log = TRUE) {
N = sum(x);
muco = lgamma(N + 1) - sum(sapply(x + 1, lgamma));
if (!log) muco = exp(muco);
return(muco);
}
dmultinomPseudo = function(x, prob, log = FALSE, pseudo = .1, factors = 1:1e2, F = NULL) {
# find multiplication factor
if (is.null(F)) {
pseudoF = t(t_(pseudo) %*% factors);
pseudoR = abs(pseudoF - round(pseudoF));
F = factors[apply(pseudoR, 2, which.min)];
}
# standardize probabilities
logP = log(v2freq(prob));
# raw likelihood contributions
pRaw = sum(round((x + pseudo) * F) * logP / F);
# add multinomial coefficient
pCooked = pRaw + mulitnomCoeff(x);
if (!log) pCooked = exp(pCooked);
return(pCooked);
}
#
# <p> random numbers
#
generateSeed = function(preSeed = Sys.time()) {
ints = hex2ints(md5sumString(as.integer(preSeed)));
r = 0
for (i in 1:length(ints)) r = bitwXor(r, ints[i]);
return(r);
}
# run expression with fixed seed but leave the RNG stream unchanged
# (bracket expression with save/restore of seed)
# as.integer(Sys.time())
fixSeedForExpr = function(seed, expr, envir = parent.frame()) {
if (!exists('.Random.seed')) runif(1); # vivify RNG
currentSeed = .Random.seed;
on.exit(.Random.seed <<- currentSeed);
set.seed(seed);
r = eval(expr, envir = envir);
#.Random.seed <<- currentSeed;
return(r);
}
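# Example: expr is evaluated lazily after set.seed, so repeated calls agree while the
# surrounding RNG stream is left untouched:
#	stopifnot(identical(fixSeedForExpr(42, rnorm(2)), fixSeedForExpr(42, rnorm(2))));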
#
# <p> general distributions
#
lchoosemult = function(x) {
N = sum(x);
K = length(x);
lmult = dmultinom(x, prob = rep(1, K), log = T) + N*log(K);
return(lmult);
}
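# sanity check: 4!/(2! 1! 1!) = 12
#	exp(lchoosemult(c(2, 1, 1)))	# 12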
#
# <p> sampling
#
# individual probabilities, sample single event
# P: N x (K - 1) matrix with class probabilities
# size == 1 only supported case
rmultinomial = function(P, size = 1) {
Ps = t2r(apply(P, 1, cumsum));
R = runif(nrow(P)); # case of size == 1
y = apply(apply(Ps, 1, function(P)R > P), 1, function(i)max(c(0, which(i))));
return(y);
}
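# Example: three classes with probabilities (.2, .3, .5); P holds the first K - 1 = 2
# class probabilities per row, labels returned are 0..K-1 (t2r: transpose helper from this codebase):
#	P = matrix(rep(c(.2, .3), each = 4), ncol = 2);
#	rmultinomial(P)	# four draws from {0, 1, 2}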
rbinomial = function(P, Nbinom = 1) {
# binomial case for Nbinom >= 1
P = t_(sapply(P, function(p)cumsum(dbinom(0:(Nbinom - 1), Nbinom, p))));
return(rmultinomial(P));
}
#
# <p> special data transformation
#
# partial long -> wide transformation by putting baseline value into separate column
dataSplitBaseline = function(dataN, valueColumn = 'RATEHEALTHTODAY',
timeColumn = 'TIMINGASSDR1', baselineValue = 'Baseline', idColumn = 'STUDYCODE', suffix = 'bl') {
valueColBl = paste0(valueColumn, suffix);
dataBL0 = by(dataN, dataN[[ idColumn ]], function(d0) {
Ibaseline = which(d0[[ timeColumn ]] == baselineValue & !is.na(d0[[ timeColumn ]]));
if (length(Ibaseline) > 1)
warning(Sprintf('Id %{id}s with %{N}d baseline values. Choosing first.', id = d0[[ idColumn ]][1], N = length(Ibaseline)));
d1 = if (length(Ibaseline) >= 1) d0[- Ibaseline, , drop = F] else d0;
val = if (length(Ibaseline) >= 1) d0[[ valueColumn ]][ Ibaseline[1] ] else NA;
if (nrow(d1) == 0) {
return(NULL);
}
d = Df(baselineVar.. = val, d1, headerMap = list(baselineVar.. = valueColBl));
return(d);
});
dataBL = Df_(do.call(rbind, dataBL0));
return(dataBL);
}
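# Example (long-format data with the hypothetical default columns): move the Baseline
# measurement per STUDYCODE into new column RATEHEALTHTODAYbl and drop the baseline rows:
#	dataBl = dataSplitBaseline(dataN);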