## (c) Simon N. Wood (2013-2022). Provided under GPL 2.
## Routines for gam estimation beyond exponential family.
dDeta <- function(y,mu,wt,theta,fam,deriv=0) {
## What is available directly from the family are derivatives of the
## deviance and link w.r.t. mu. This routine converts these to the
## required derivatives of the deviance w.r.t. eta.
## deriv is the order of derivative of the smoothing parameter score
## required.
## This version is based on ratios of derivatives of links rather
## than raw derivatives of links. g2g = g''/g'^2, g3g = g'''/g'^3 etc
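## Chain rule used below: with mu = g^{-1}(eta) and ig1 = dmu/deta = 1/g'(mu),
## Deta = Dmu*ig1 and Deta2 = Dmu2*ig1^2 + Dmu*d2mu/deta2
##      = Dmu2*ig1^2 - Dmu*g2g*ig1,
## since d2mu/deta2 = -g''/g'^3 = -g2g*ig1. The third and fourth order
## expressions in the non-identity branch below follow in the same way.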
r <- fam$Dd(y, mu, theta, wt, level=deriv)
d <- list(Deta=0,Dth=0,Dth2=0,Deta2=0,EDeta2=0,Detath=0,
Deta3=0,Deta2th=0,Detath2=0,
Deta4=0,Deta3th=0,Deta2th2=0)
if (fam$link=="identity") { ## don't waste time on transformation
d$Deta <- r$Dmu;d$Deta2 <- r$Dmu2
d$EDeta2 <- r$EDmu2;d$Deta.Deta2 <- r$Dmu/r$Dmu2
d$Deta.EDeta2 <- r$Dmu/r$EDmu2
if (deriv>0) {
d$Dth <- r$Dth; d$Detath <- r$Dmuth
d$Deta3 <- r$Dmu3; d$Deta2th <- r$Dmu2th
d$EDeta2th <- r$EDmu2th;d$EDeta3 <- r$EDmu3
}
if (deriv>1) {
d$Deta4 <- r$Dmu4; d$Dth2 <- r$Dth2; d$Detath2 <- r$Dmuth2
d$Deta2th2 <- r$Dmu2th2; d$Deta3th <- r$Dmu3th
}
} else {
ig1 <- fam$mu.eta(fam$linkfun(mu))
ig12 <- ig1^2
g2g <- fam$g2g(mu)
d$Deta <- r$Dmu * ig1
d$Deta2 <- r$Dmu2*ig12 - r$Dmu*g2g*ig1
d$EDeta2 <- r$EDmu2*ig12
d$Deta.Deta2 <- r$Dmu/(r$Dmu2*ig1 - r$Dmu*g2g)
d$Deta.EDeta2 <- r$Dmu/(r$EDmu2*ig1)
if (deriv>0) {
ig13 <- ig12 * ig1
d$Dth <- r$Dth
d$Detath <- r$Dmuth * ig1
g3g <- fam$g3g(mu)
d$Deta3 <- r$Dmu3*ig13 - 3*r$Dmu2 * g2g * ig12 + r$Dmu * (3*g2g^2 - g3g)*ig1
if (!is.null(r$EDmu3)) d$EDeta3 <- r$EDmu3*ig13 - 3*r$EDmu2 * g2g * ig12 ## EDmu=0, so the Dmu term drops
d$Deta2th <- r$Dmu2th*ig12 - r$Dmuth*g2g*ig1
if (!is.null(r$EDmu2th)) d$EDeta2th <- r$EDmu2th*ig12 ##- r$EDmuth*g2g*ig1
}
if (deriv>1) {
g4g <- fam$g4g(mu)
d$Deta4 <- ig12^2*r$Dmu4 - 6*r$Dmu3*ig13*g2g + r$Dmu2*(15*g2g^2-4*g3g)*ig12 -
r$Dmu*(15*g2g^3-10*g2g*g3g +g4g)*ig1
d$Dth2 <- r$Dth2
d$Detath2 <- r$Dmuth2 * ig1
d$Deta2th2 <- ig12*r$Dmu2th2 - r$Dmuth2*g2g*ig1
d$Deta3th <- ig13*r$Dmu3th - 3 *r$Dmu2th*g2g*ig12 + r$Dmuth*(3*g2g^2-g3g)*ig1
}
} ## end of non identity
good <- is.finite(d$Deta)&is.finite(d$Deta2)
if (deriv>0) {
nth <- length(theta)
if (nth>1) good <- good&is.finite(rowSums(d$Dth))&is.finite(rowSums(d$Detath))&
is.finite(rowSums(d$Deta2th))&is.finite(d$Deta3) else
good <- good & is.finite(d$Deta3) & if (nth==0) TRUE else
is.finite(d$Dth)&is.finite(d$Detath)&is.finite(d$Deta2th)
if (deriv>1) {
if (nth>1) good <- good&is.finite(rowSums(d$Dth2))&is.finite(rowSums(d$Detath2))&is.finite(rowSums(d$Deta2th2))&
is.finite(rowSums(d$Deta3th))&is.finite(d$Deta4) else
good <- good & is.finite(d$Deta4) & if (nth==0) TRUE else
is.finite(d$Dth2)&is.finite(d$Detath2)&is.finite(d$Deta2th2)&is.finite(d$Deta3th)
}
}
d$good <- good
d
} ## dDeta
fetad.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family derivatives w.r.t. eta by finite differencing.
## note: plots are produced unconditionally here (unlike in fmud.test below);
## the 'plot' argument is currently ignored.
dd <- dDeta(y,mu,wt,theta,fam,deriv=2)
dev <- fam$dev.resids(y, mu, wt,theta)
mu1 <- fam$linkinv(fam$linkfun(mu)+eps)
dev1 <- fam$dev.resids(y,mu1, wt,theta)
Deta.fd <- (dev1-dev)/eps
cat("Deta: rdiff = ",range(dd$Deta-Deta.fd)," cor = ",cor(dd$Deta,Deta.fd),"\n")
plot(dd$Deta,Deta.fd);abline(0,1)
nt <- length(theta)
if (fam$n.theta && nt>0) for (i in 1:nt) {
th1 <- theta;th1[i] <- th1[i] + eps
dev1 <- fam$dev.resids(y, mu, wt,th1)
Dth.fd <- (dev1-dev)/eps
um <- if (nt>1) dd$Dth[,i] else dd$Dth
cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",cor(um,Dth.fd),"\n")
plot(um,Dth.fd);abline(0,1)
}
## second order up...
dd1 <- dDeta(y,mu1,wt,theta,fam,deriv=2)
Deta2.fd <- (dd1$Deta - dd$Deta)/eps
cat("Deta2: rdiff = ",range(dd$Deta2-Deta2.fd)," cor = ",cor(dd$Deta2,Deta2.fd),"\n")
plot(dd$Deta2,Deta2.fd);abline(0,1)
Deta3.fd <- (dd1$Deta2 - dd$Deta2)/eps
cat("Deta3: rdiff = ",range(dd$Deta3-Deta3.fd)," cor = ",cor(dd$Deta3,Deta3.fd),"\n")
plot(dd$Deta3,Deta3.fd);abline(0,1)
Deta4.fd <- (dd1$Deta3 - dd$Deta3)/eps
cat("Deta4: rdiff = ",range(dd$Deta4-Deta4.fd)," cor = ",cor(dd$Deta4,Deta4.fd),"\n")
plot(dd$Deta4,Deta4.fd);abline(0,1)
## and now the higher derivs wrt theta...
ind <- 1:nt
if (fam$n.theta && nt>0) for (i in 1:nt) {
th1 <- theta;th1[i] <- th1[i] + eps
dd1 <- dDeta(y,mu,wt,th1,fam,deriv=2)
Detath.fd <- (dd1$Deta - dd$Deta)/eps
um <- if (nt>1) dd$Detath[,i] else dd$Detath
cat("Detath[",i,"]: rdiff = ",range(um-Detath.fd)," cor = ",cor(um,Detath.fd),"\n")
plot(um,Detath.fd);abline(0,1)
Deta2th.fd <- (dd1$Deta2 - dd$Deta2)/eps
um <- if (nt>1) dd$Deta2th[,i] else dd$Deta2th
cat("Deta2th[",i,"]: rdiff = ",range(um-Deta2th.fd)," cor = ",cor(um,Deta2th.fd),"\n")
plot(um,Deta2th.fd);abline(0,1)
Deta3th.fd <- (dd1$Deta3 - dd$Deta3)/eps
um <- if (nt>1) dd$Deta3th[,i] else dd$Deta3th
cat("Deta3th[",i,"]: rdiff = ",range(um-Deta3th.fd)," cor = ",cor(um,Deta3th.fd),"\n")
plot(um,Deta3th.fd);abline(0,1)
## now the 3 second derivative w.r.t. theta terms
Dth2.fd <- (dd1$Dth - dd$Dth)/eps
um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
plot(um,er);abline(0,1)
Detath2.fd <- (dd1$Detath - dd$Detath)/eps
um <- if (nt>1) dd$Detath2[,ind] else dd$Detath2
er <- if (nt>1) Detath2.fd[,i:nt] else Detath2.fd
cat("Detath2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
## cat("Detath2[",i,",]: rdiff = ",range(dd$Detath2-Detath2.fd)," cor = ",cor(dd$Detath2,Detath2.fd),"\n")
plot(um,er);abline(0,1)
Deta2th2.fd <- (dd1$Deta2th - dd$Deta2th)/eps
um <- if (nt>1) dd$Deta2th2[,ind] else dd$Deta2th2
er <- if (nt>1) Deta2th2.fd[,i:nt] else Deta2th2.fd
cat("Deta2th2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
## cat("Deta2th2[",i,",]: rdiff = ",range(dd$Deta2th2-Deta2th2.fd)," cor = ",cor(dd$Deta2th2,Deta2th2.fd),"\n")
ind <- max(ind)+1:(nt-i)
plot(um,er);abline(0,1)
}
} ## fetad.test
corb <- function(x,z) {
## alternative to cor for measuring the similarity of x and z.
## unlike cor it is not scale invariant, so a value of 1 really means
## that x and z are very close, not just linearly related.
d <- x-z
1-mean(d^2)/(sd(x)*sd(z))
}
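## Illustrative sketch (not run): for z = 2*x, cor(x,z) is exactly 1 because
## cor is scale invariant, while corb(x,z) is roughly 0.5, since mean((x-z)^2)
## is about var(x) while sd(x)*sd(z) is about 2*var(x)...
if (FALSE) {
  x <- rnorm(100); z <- 2*x
  cor(x,z)  ## exactly 1
  corb(x,z) ## approximately 0.5
}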
fmud.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family deviance derivatives w.r.t. mu
## copy to make debugging easier...
Dd <- fam$Dd;dev.resids <- fam$dev.resids
dd <- Dd(y, mu, theta, wt, level=2)
dev <- dev.resids(y, mu, wt,theta)
dev1 <- dev.resids(y, mu+eps, wt,theta)
Dmu.fd <- (dev1-dev)/eps
cat("Dmu: rdiff = ",range(dd$Dmu-Dmu.fd)," cor = ",corb(dd$Dmu,Dmu.fd),"\n")
if (plot) {
pch <- 19;cex <- .4
plot(dd$Dmu,Dmu.fd,pch=pch,cex=cex);abline(0,1,col=2)
oask <- devAskNewPage(TRUE)
on.exit(devAskNewPage(oask))
}
nt <- length(theta)
if (fam$n.theta>0&&nt>0) for (i in 1:nt) {
th1 <- theta;th1[i] <- th1[i] + eps
dev1 <- dev.resids(y, mu, wt,th1)
Dth.fd <- (dev1-dev)/eps
um <- if (nt>1) dd$Dth[,i] else dd$Dth
cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",corb(um,Dth.fd),"\n",sep="")
if (plot) { plot(um,Dth.fd,pch=pch,cex=cex,xlab=paste("Dth[",i,"]",sep=""));
abline(0,1,col=2)}
}
## second order up...
dd1 <- Dd(y, mu+eps, theta, wt, level=2)
Dmu2.fd <- (dd1$Dmu - dd$Dmu)/eps
cat("Dmu2: rdiff = ",range(dd$Dmu2-Dmu2.fd)," cor = ",corb(dd$Dmu2,Dmu2.fd),"\n")
if (plot) { plot(dd$Dmu2,Dmu2.fd,pch=pch,cex=cex);abline(0,1,col=2)}
Dmu3.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
cat("Dmu3: rdiff = ",range(dd$Dmu3-Dmu3.fd)," cor = ",corb(dd$Dmu3,Dmu3.fd),"\n")
if (plot) { plot(dd$Dmu3,Dmu3.fd,pch=pch,cex=cex);abline(0,1,col=2)}
Dmu4.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
cat("Dmu4: rdiff = ",range(dd$Dmu4-Dmu4.fd)," cor = ",corb(dd$Dmu4,Dmu4.fd),"\n")
if (plot) { plot(dd$Dmu4,Dmu4.fd,pch=pch,cex=cex);abline(0,1,col=2)}
## and now the higher derivs wrt theta
ind <- 1:nt
if (fam$n.theta>0&&nt>0) for (i in 1:nt) {
th1 <- theta;th1[i] <- th1[i] + eps
dd1 <- Dd(y, mu, th1, wt, level=2)
Dmuth.fd <- (dd1$Dmu - dd$Dmu)/eps
um <- if (nt>1) dd$Dmuth[,i] else dd$Dmuth
cat("Dmuth[",i,"]: rdiff = ",range(um-Dmuth.fd)," cor = ",corb(um,Dmuth.fd),"\n")
if (plot) { plot(um,Dmuth.fd,pch=pch,cex=cex,xlab=paste("Dmuth[",i,"]",sep=""));abline(0,1,col=2)}
Dmu2th.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
um <- if (nt>1) dd$Dmu2th[,i] else dd$Dmu2th
cat("Dmu2th[",i,"]: rdiff = ",range(um-Dmu2th.fd)," cor = ",corb(um,Dmu2th.fd),"\n")
if (plot) { plot(um,Dmu2th.fd,pch=pch,cex=cex,xlab=paste("Dmu2th[",i,"]",sep=""));abline(0,1,col=2)}
if (!is.null(dd$EDmu2th)) {
EDmu2th.fd <- (dd1$EDmu2 - dd$EDmu2)/eps
um <- if (nt>1) dd$EDmu2th[,i] else dd$EDmu2th
cat("EDmu2th[",i,"]: rdiff = ",range(um-EDmu2th.fd)," cor = ",corb(um,EDmu2th.fd),"\n")
if (plot) { plot(um,EDmu2th.fd,pch=pch,cex=cex,xlab=paste("EDmu2th[",i,"]",sep=""));abline(0,1,col=2)}
}
Dmu3th.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
um <- if (nt>1) dd$Dmu3th[,i] else dd$Dmu3th
cat("Dmu3th[",i,"]: rdiff = ",range(um-Dmu3th.fd)," cor = ",corb(um,Dmu3th.fd),"\n")
if (plot) { plot(um,Dmu3th.fd,pch=pch,cex=cex,xlab=paste("Dmu3th[",i,"]",sep=""));abline(0,1,col=2)}
## now the 3 second derivative w.r.t. theta terms...
Dth2.fd <- (dd1$Dth - dd$Dth)/eps
um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
Dmuth2.fd <- (dd1$Dmuth - dd$Dmuth)/eps
um <- if (nt>1) dd$Dmuth2[,ind] else dd$Dmuth2
er <- if (nt>1) Dmuth2.fd[,i:nt] else Dmuth2.fd
cat("Dmuth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmuth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
Dmu2th2.fd <- (dd1$Dmu2th - dd$Dmu2th)/eps
um <- if (nt>1) dd$Dmu2th2[,ind] else dd$Dmu2th2
er <- if (nt>1) Dmu2th2.fd[,i:nt] else Dmu2th2.fd
cat("Dmu2th2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmu2th2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
ind <- max(ind)+1:(nt-i)
}
} ## fmud.test
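## Debugging sketch (not run): both testers are normally invoked from the
## check.derivs block in gam.fit4 below, on the current state of a fit.
## Assuming y, mu, wt and theta taken from such a fit and 'fam' a fully
## initialized extended.family, a hypothetical direct call is:
if (FALSE) {
  fmud.test(y,mu,wt,theta,fam,eps=1e-7)  ## deviance derivatives w.r.t. mu
  fetad.test(y,mu,wt,theta,fam,eps=1e-7) ## and w.r.t. eta
}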
gam.fit4 <- function(x, y, sp, Eb,UrS=list(),
weights = rep(1, nobs), start = NULL, etastart = NULL,
mustart = NULL, offset = rep(0, nobs),U1=diag(ncol(x)), Mp=-1, family = gaussian(),
control = gam.control(), deriv=2,gamma=1,
scale=1,scoreType="REML",null.coef=rep(0,ncol(x)),nei=NULL,...) {
## Routine for fitting GAMs beyond exponential family.
## Inputs as gam.fit3 except that family is of class "extended.family", while
## sp contains the vector of extended family parameters, followed by the log smoothing parameters,
## followed by the log scale parameter if scale < 0
## some families have second derivative of deviance, and hence iterative weights
## very close to zero for some data. This can lead to poorly scaled sqrt(w)z
## and it is better to base everything on wz...
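## Specifically, with w = Deta2/2 and pseudodata z = (eta-offset) - Deta/(2*w),
## the product wz = w*(eta-offset) - Deta/2 stays finite as w -> 0, whereas
## sqrt(w)*z does not; use.wz signals the least squares code to work with wz
## directly.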
if (is.null(family$use.wz)) family$use.wz <- FALSE
warn <- list()
if (family$n.theta>0) { ## there are extra parameters to estimate
ind <- 1:family$n.theta
theta <- sp[ind] ## parameters of the family
family$putTheta(theta)
sp <- sp[-ind] ## log smoothing parameters
} else theta <- family$getTheta() ## fixed value
## penalized <- if (length(UrS)>0) TRUE else FALSE
if (scale>0) scale.known <- TRUE else {
## unknown scale parameter, trial value supplied as
## final element of sp.
scale.known <- FALSE
nsp <- length(sp)
scale <- exp(sp[nsp])
sp <- sp[-nsp]
}
x <- as.matrix(x)
nSp <- length(sp)
rank.tol <- .Machine$double.eps*100 ## tolerance to use for rank deficiency
q <- ncol(x)
n <- nobs <- nrow(x)
xnames <- dimnames(x)[[2]]
ynames <- if (is.matrix(y)) rownames(y) else names(y)
## Now a stable re-parameterization is needed....
if (length(UrS)) {
grderiv <- if (scoreType=="EFS") 1 else deriv
rp <- gam.reparam(UrS,sp,grderiv)
T <- diag(q)
T[1:ncol(rp$Qs),1:ncol(rp$Qs)] <- rp$Qs
T <- U1%*%T ## new params b'=T'b old params
null.coef <- t(T)%*%null.coef
if (!is.null(start)) start <- t(T)%*%start
## form x%*%T in parallel
x <- .Call(C_mgcv_pmmult2,x,T,0,0,control$nthreads)
rS <- list()
for (i in 1:length(UrS)) {
rS[[i]] <- rbind(rp$rS[[i]],matrix(0,Mp,ncol(rp$rS[[i]])))
} ## square roots of penalty matrices in current parameterization
Eb <- Eb%*%T ## balanced penalty matrix
rows.E <- q-Mp
Sr <- cbind(rp$E,matrix(0,nrow(rp$E),Mp))
St <- rbind(cbind(rp$S,matrix(0,nrow(rp$S),Mp)),matrix(0,Mp,q))
} else {
grderiv <- 0
T <- diag(q);
St <- matrix(0,q,q)
rSncol <- rows.E <- Eb <- Sr <- 0
rS <- list(0)
rp <- list(det=0,det1 = 0,det2 = 0,fixed.penalty=FALSE)
}
## re-parameterization complete. Initialization....
nvars <- ncol(x)
if (nvars==0) stop("empty models not available")
if (is.null(weights)) weights <- rep.int(1, nobs)
if (is.null(offset)) offset <- rep.int(0, nobs)
linkinv <- family$linkinv
valideta <- family$valideta
validmu <- family$validmu
dev.resids <- family$dev.resids
## need an initial `null deviance' to test for initial divergence...
## if (!is.null(start)) null.coef <- start - can be on edge of feasible - not good
null.eta <- as.numeric(x%*%null.coef + as.numeric(offset))
## call the families initialization code...
if (is.null(mustart)) {
eval(family$initialize)
mukeep <- NULL
} else {
mukeep <- mustart
eval(family$initialize)
#mustart <- mukeep
}
old.pdev <- sum(dev.resids(y, linkinv(null.eta), weights,theta)) + t(null.coef)%*%St%*%null.coef
if (!is.null(start)) { ## check it's at least better than null.coef
pdev <- sum(dev.resids(y, linkinv(x%*%start+as.numeric(offset)), weights,theta)) + t(start)%*%St%*%start
if (pdev>old.pdev) start <- mukeep <- etastart <- NULL
}
coefold <- null.coef ## set to default, may be replaced below
etaold <- x %*% coefold + offset
if (!is.null(mukeep)) mustart <- mukeep
## and now finalize initialization of mu and eta...
eta <- if (!is.null(etastart)) etastart
else if (!is.null(start))
if (length(start) != nvars)
stop("Length of start should equal ", nvars,
" and correspond to initial coefs for ", deparse(xnames))
else {
coefold <- start
etaold <- offset + as.vector(if (NCOL(x) == 1)
x * start
else x %*% start)
}
else family$linkfun(mustart)
mu <- linkinv(eta)
conv <- boundary <- FALSE
dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
w <- dd$Deta2 * .5
wz <- w*(eta-offset) - .5*dd$Deta
z <- (eta-offset) - dd$Deta.Deta2
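## Newton based working weights and pseudodata: w = Deta2/2,
## z = (eta-offset) - Deta/Deta2 and wz = w*z, so that minimizing
## sum(w*(z - X%*%b)^2) + b'St b over b is a full Newton step on the
## penalized deviance; wz is formed directly for stability when w is tiny.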
good <- is.finite(z)&is.finite(w)
zg <- rep(0,max(dim(x)))
for (iter in 1:control$maxit) { ## start of main fitting iteration
if (control$trace) cat(iter," ")
if (control$trace&sum(!good)>0) cat("\n",sum(!good)," not good\n")
if (sum(!good)) {
use.wy <- TRUE
good <- is.finite(w)&is.finite(wz)
z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
} else use.wy <- family$use.wz
if (sum(good)==0) stop("no good data in iteration")
ng <- sum(good)
zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
oo <- .C(C_pls_fit1,
y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
penalty=as.double(1),rank.tol=as.double(rank.tol),
nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
posdef <- oo$n >= 0
if (!posdef) { ## then problem is indefinite - switch to +ve weights for this step
if (control$trace) cat("**using positive weights\n")
## problem is that Fisher weighting can be very poor for zeroes
## index weights that are finite and positive
good <- is.finite(dd$Deta2)
good[good] <- dd$Deta2[good]>0
w[!good] <- 0
wz <- w*(eta-offset) - .5*dd$Deta
z <- (eta-offset) - dd$Deta.Deta2
good <- is.finite(z)&is.finite(w)
if (sum(!good)) {
use.wy <- TRUE
good <- is.finite(w)&is.finite(wz)
z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
} else use.wy <- family$use.wz
ng <- sum(good)
zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
oo <- .C(C_pls_fit1, ##C_pls_fit1,
y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
penalty=as.double(1),rank.tol=as.double(rank.tol),
nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
}
start <- oo$y[1:ncol(x)] ## current coefficient estimates
penalty <- oo$penalty ## size of penalty
eta <- drop(x%*%start) ## the linear predictor (less offset)
if (any(!is.finite(start))) { ## test for breakdown
conv <- FALSE
warn[[length(warn)+1]] <- paste(" gam.fit4 non-finite coefficients at iteration ",
iter)
return(list(REML=NA,warn=warn)) ## return immediately signalling failure
}
mu <- linkinv(eta <- eta + offset)
dev <- sum(dev.resids(y, mu, weights,theta))
## now step halve under non-finite deviance...
if (!is.finite(dev)) {
if (is.null(coefold)) {
if (is.null(null.coef))
stop("no valid set of coefficients has been found:please supply starting values",
call. = FALSE)
## Try to find feasible coefficients from the null.coef and null.eta
coefold <- null.coef
etaold <- null.eta
}
ii <- 1
while (!is.finite(dev)) {
if (ii > control$maxit)
stop("inner loop 1; can't correct step size")
ii <- ii + 1
start <- (start + coefold)/2
eta <- (eta + etaold)/2
mu <- linkinv(eta)
dev <- sum(dev.resids(y, mu, weights,theta))
}
boundary <- TRUE
penalty <- t(start)%*%St%*%start ## reset penalty too
if (control$trace)
cat("Step halved: new deviance =", dev, "\n")
} ## end of infinite deviance correction
## now step halve if mu or eta are out of bounds...
if (!(valideta(eta) && validmu(mu))) {
ii <- 1
while (!(valideta(eta) && validmu(mu))) {
if (ii > control$maxit)
stop("inner loop 2; can't correct step size")
ii <- ii + 1
start <- (start + coefold)/2
eta <- (eta + etaold)/2
mu <- linkinv(eta)
}
boundary <- TRUE
dev <- sum(dev.resids(y, mu, weights,theta))
penalty <- t(start)%*%St%*%start ## need to reset penalty too
if (control$trace)
cat("Step halved: new deviance =", dev, "\n")
} ## end of invalid mu/eta handling
## now check for divergence of penalized deviance....
pdev <- dev + penalty ## the penalized deviance
if (control$trace) cat("penalized deviance =", pdev, "\n")
div.thresh <- 10*(.1+abs(old.pdev))*.Machine$double.eps^.5
if (pdev-old.pdev>div.thresh) { ## solution diverging
ii <- 1 ## step halving counter
if (iter==1) { ## immediate divergence, need to shrink towards zero
etaold <- null.eta; coefold <- null.coef
}
while (pdev -old.pdev > div.thresh) { ## step halve until pdev <= old.pdev
if (ii > 100)
stop("inner loop 3; can't correct step size")
ii <- ii + 1
start <- (start + coefold)/2
eta <- (eta + etaold)/2
mu <- linkinv(eta)
dev <- sum(dev.resids(y, mu, weights,theta))
penalty <- t(start)%*%St%*%start
pdev <- dev + penalty ## the penalized deviance
if (control$trace)
cat("Step halved: new penalized deviance =", pdev, "\n")
}
} ## end of pdev divergence
if (scoreType=="EFS"&&family$n.theta>0) { ## there are theta parameters to estimate...
scale1 <- if (!is.null(family$scale)) family$scale else scale
if (family$n.theta>0||scale<0) theta <- estimate.theta(theta,family,y,mu,scale=scale1,wt=weights,tol=1e-7)
if (!is.null(family$scale) && family$scale<0) {
scale <- exp(theta[family$n.theta+1])
theta <- theta[1:family$n.theta]
}
family$putTheta(theta)
}
## get new weights and pseudodata (needed now for grad testing)...
dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
w <- dd$Deta2 * .5;
wz <- w*(eta-offset) - .5*dd$Deta
z <- (eta-offset) - dd$Deta.Deta2
good <- is.finite(z)&is.finite(w)
## convergence testing...
if (posdef && abs(pdev - old.pdev)/(0.1 + abs(pdev)) < control$epsilon) {
## Need to check coefs converged adequately, to ensure implicit differentiation
## ok. Testing coefs unchanged is problematic under rank deficiency (not guaranteed to
## drop same parameter every iteration!)
grad <- 2 * t(x[good,,drop=FALSE])%*%((w[good]*(x%*%start)[good]-wz[good]))+ 2*St%*%start
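## gradient of the penalized deviance w.r.t. the coefs over the 'good' data:
## since Deta = 2*(w*(eta-offset) - wz), dD/db = 2*X'(w*(X%*%b) - wz), and the
## penalty contributes 2*St%*%b.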
if (max(abs(grad)) > control$epsilon*(abs(pdev)+scale)) {
old.pdev <- pdev ## not converged quite enough
coef <- coefold <- start
etaold <- eta
##muold <- mu
} else { ## converged
conv <- TRUE
coef <- start
break
}
} else { ## not converged
old.pdev <- pdev
coef <- coefold <- start
etaold <- eta
}
if (scoreType=="EFS"&&family$n.theta>0) {
## now recompute pdev with new theta, otherwise step control won't work at next iteration
dev <- sum(dev.resids(y, mu, weights,theta))
old.pdev <- pdev <- dev + penalty
}
} ## end of main loop
## so at this stage the model has been fully estimated
coef <- as.numeric(T %*% coef)
## now obtain derivatives, if these are needed...
check.derivs <- FALSE
while (check.derivs) { ## debugging code to check derivatives
eps <- 1e-7
fmud.test(y,mu,weights,theta,family,eps = eps)
fetad.test(y,mu,weights,theta,family,eps = eps)
}
dd <- dDeta(y,mu,weights,theta,family,deriv)
w <- dd$Deta2 * .5
z <- (eta-offset) - dd$Deta.Deta2 ## - .5 * dd$Deta[good] / w
wf <- pmax(0,dd$EDeta2 * .5) ## Fisher type weights
wz <- w*(eta-offset) - 0.5*dd$Deta ## Wz finite when w==0
good <- is.finite(wz)&is.finite(w)&dd$good
if (sum(good)==0) stop("not enough finite derivatives")
residuals <- z - (eta - offset)
residuals[!is.finite(residuals)] <- NA
z[!is.finite(z)] <- 0 ## avoid passing NA etc to C code
ntot <- length(theta) + length(sp)
rSncol <- unlist(lapply(UrS,ncol))
## Now drop any elements of dd that have been dropped in fitting...
if (sum(!good)>0) { ## drop !good from fields of dd, weights and pseudodata
z <- z[good]; w <- w[good]; wz <- wz[good]; wf <- wf[good]
dd$Deta <- dd$Deta[good];dd$Deta2 <- dd$Deta2[good]
dd$EDeta2 <- dd$EDeta2[good]
if (deriv>0) dd$Deta3 <- dd$Deta3[good]
if (deriv>1) dd$Deta4 <- dd$Deta4[good]
if (length(theta)>1) {
if (deriv>0) {
dd$Dth <- dd$Dth[good,];
dd$Detath <- dd$Detath[good,]; dd$Deta2th <- dd$Deta2th[good,]
if (deriv>1) {
dd$Detath2 <- dd$Detath2[good,]; dd$Deta3th <- dd$Deta3th[good,]
dd$Deta2th2 <- dd$Deta2th2[good,];dd$Dth2 <- dd$Dth2[good,]
}
}
} else {
if (deriv>0) {
dd$Dth <- dd$Dth[good];
dd$Detath <- dd$Detath[good]; dd$Deta2th <- dd$Deta2th[good]
if (deriv>1) {
dd$Detath2 <- dd$Detath2[good]; dd$Deta3th <- dd$Deta3th[good]
dd$Deta2th2 <- dd$Deta2th2[good]; dd$Dth2 <- dd$Dth2[good]
}
}
}
}
## gdi.type should probably be computed after dropping via good
gdi.type <- if (any(abs(w)<.Machine$double.xmin*1e20)||any(!is.finite(z))) 1 else 0
if (scoreType=="EFS") scoreType <- "REML"
oo <- .C(C_gdi2,
X=as.double(x[good,]),E=as.double(Sr),Es=as.double(Eb),rS=as.double(unlist(rS)),
U1 = as.double(U1),sp=as.double(exp(sp)),theta=as.double(theta),
z=as.double(z),w=as.double(w),wz=as.double(wz),wf=as.double(wf),Dth=as.double(dd$Dth),
Det=as.double(dd$Deta),
Det2=as.double(dd$Deta2),Dth2=as.double(dd$Dth2),Det.th=as.double(dd$Detath),
Det2.th=as.double(dd$Deta2th),Det3=as.double(dd$Deta3),Det.th2 = as.double(dd$Detath2),
Det4 = as.double(dd$Deta4),Det3.th=as.double(dd$Deta3th), Deta2.th2=as.double(dd$Deta2th2),
beta=as.double(coef),b1=as.double(rep(0,ntot*ncol(x))),w1=as.double(rep(0,ntot*length(z))),
D1=as.double(rep(0,ntot)),D2=as.double(rep(0,ntot^2)),
P=as.double(0),P1=as.double(rep(0,ntot)),P2 = as.double(rep(0,ntot^2)),
ldet=as.double(1-2*(scoreType=="ML")),ldet1 = as.double(rep(0,ntot)),
ldet2 = as.double(rep(0,ntot^2)),
rV=as.double(rep(0,ncol(x)^2)),
rank.tol=as.double(.Machine$double.eps^.75),rank.est=as.integer(0),
n=as.integer(sum(good)),q=as.integer(ncol(x)),M=as.integer(nSp),
n.theta=as.integer(length(theta)), Mp=as.integer(Mp),Enrow=as.integer(rows.E),
rSncol=as.integer(rSncol),deriv=as.integer(deriv),
fixed.penalty = as.integer(rp$fixed.penalty),nt=as.integer(control$nthreads),
type=as.integer(gdi.type),dVkk=as.double(rep(0,nSp^2)))
rV <- matrix(oo$rV,ncol(x),ncol(x)) ## rV%*%t(rV)*scale gives covariance matrix
#rV <- rV # transform before return
## derivatives of coefs w.r.t. sps etc...
## note that db.drho and dw.drho start with derivs wrt theta, then wrt sp (no scale param of course)
db.drho <- if (deriv) matrix(oo$b1,ncol(x),ntot) else NULL ## transform before return
dw.drho <- if (deriv) matrix(oo$w1,length(z),ntot) else NULL
Kmat <- matrix(0,nrow(x),ncol(x))
Kmat[good,] <- oo$X ## rV%*%t(K)%*%(sqrt(wf)*X) = F; diag(F) is edf array
Vg <- NCV <- NCV1 <- REML <- REML1 <- REML2 <- NULL
if (scoreType=="NCV") {
eta.cv <- rep(0.0,length(nei$i))
deta.cv <- if (deriv) matrix(0.0,length(nei$i),ntot) else matrix(0.0,1,ntot)
w1 <- -dd$Deta/2; w2 <- dd$Deta2/2; dth <- dd$Detath/2 ## !?
R <- try(chol(crossprod(x,w*x)+St),silent=TRUE)
if (nei$jackknife > 2) { ## return NCV coef changes for each fold
if (deriv>0) stop("jackknife and derivatives requested together")
dth <- matrix(0,ncol(x),length(nei$m))
deriv1 <- -1 ## signal to return coef changes
} else deriv1 <- deriv
if (inherits(R,"try-error")) { ## use CG approach...
Hi <- tcrossprod(rV) ## inverse of penalized Expected Hessian - inverse actual Hessian probably better
cg.iter <- .Call(C_ncv,x,Hi,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv, deta.cv, dth, deriv1);
warn[[length(warn)+1]] <- "NCV positive definite update check not possible"
} else { ## use Cholesky update approach
pdef.fails <- .Call(C_Rncv,x,R,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv,
deta.cv, dth, deriv1,.Machine$double.eps,control$ncv.threads);
if (pdef.fails) warn[[length(warn)+1]] <- "some NCV updates not positive definite"
}
mu.cv <- linkinv(eta.cv)
nt <- family$n.theta
## if some elements of theta are fixed, figure out which derivatives will be needed...
if (deriv) keep <- if (length(theta)>nt) (length(theta)+1):(ncol(db.drho)+!scale.known) else 1:(ncol(db.drho)+!scale.known)
dev0 <- dev.resids(y[nei$i], mu[nei$i], weights[nei$i],theta)
ls0 <- family$ls(y[nei$i],weights[nei$i],theta,scale)
if (family$qapprox) { ## quadratic approximation to NCV
qdev <- dev0 + gamma*dd$Deta[nei$i]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2[nei$i]*(eta.cv-eta[nei$i])^2
NCV <- sum(qdev)/(2*scale) - ls0$ls
if (deriv) {
deta <- x %*% db.drho
ncv1 <- (dd$Deta[nei$i]*((1-gamma)*deta[nei$i,,drop=FALSE]+gamma*deta.cv) +
gamma*dd$Deta2[nei$i]*deta.cv*(eta.cv-eta[nei$i]) +
0.5*gamma*as.numeric(dd$Deta3[nei$i])*deta[nei$i,,drop=FALSE]*(eta.cv-eta[nei$i])^2)/(2*scale)
if (nt>0) { ## deal with direct dependence on the theta parameters
ncv1[,1:nt] <- ncv1[,1:nt]- ls0$LSTH1[,1:nt] +
if (nt==1) (dd$Dth[nei$i] + gamma*dd$Detath[nei$i]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2th[nei$i]*(eta.cv-eta[nei$i])^2)/(2*scale)
else (dd$Dth[nei$i,] + gamma*dd$Detath[nei$i,]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2th[nei$i,]*(eta.cv-eta[nei$i])^2)/(2*scale)
}
if (!scale.known) {
ncv1 <- cbind(ncv1,-qdev/(2*scale) - ls0$LSTH1[,1+nt])
}
}
} else { ## exact NCV
dev.cv <- dev.resids(y, mu.cv, weights,theta)
NCV <- sum(dev.cv)/(2*scale) - ls0$ls
DEV <- sum(dev0)/(2*scale) - ls0$ls
if (gamma!=1) NCV <- gamma*NCV - (gamma-1)*DEV
if (deriv) {
dd.cv <- dDeta(y[nei$i],mu.cv,weights[nei$i],theta,family,1)
ncv1 <- dd.cv$Deta*deta.cv/(2*scale)
if (gamma!=1) dev1 <- (dd$Deta*(x%*%db.drho))[nei$i,,drop=FALSE]/(2*scale)
if (nt>0) {
ncv1[,1:nt] <- ncv1[,1:nt] + as.matrix(dd.cv$Dth/(2*scale)) - ls0$LSTH1[,1:nt]
if (gamma!=1) dev1[,1:nt] <- dev1[,1:nt] + as.matrix(dd$Dth/(2*scale))[nei$i,,drop=FALSE] - ls0$LSTH1[,1:nt]
}
if (!scale.known) { ## deal with log scale parameter derivative
ncv1 <- cbind(ncv1,-dev.cv/(2*scale) - ls0$LSTH1[,1+nt])
if (gamma!=1) dev1 <- cbind(dev1,-dev0/(2*scale) - ls0$LSTH1[,1+nt])
}
if (gamma!=1) ncv1 <- gamma*ncv1 - (gamma-1)*dev1
}
} ## exact NCV
if (nei$jackknife>2) {
nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights
dth <-jkw*t(dth)%*%t(T)
#Vj <- crossprod(dd) ## jackknife cov matrix for coefs (beta)
#attr(Vj,"dd") <- dd
#attr(NCV,"Vj") <- Vj
attr(NCV,"dd") <- dth
}
attr(NCV,"eta.cv") <- eta.cv
if (deriv) {
attr(NCV,"deta.cv") <- deta.cv;
NCV1 <- colSums(ncv1)
NCV1 <- NCV1[keep] ## drop derivatives for any fixed theta parameters
Vg <- crossprod(ncv1[,keep,drop=FALSE]) ## empirical cov matrix of grad
}
} else { ## RE/ML
D2 <- matrix(oo$D2,ntot,ntot); ldet2 <- matrix(oo$ldet2,ntot,ntot)
bSb2 <- matrix(oo$P2,ntot,ntot)
## compute the REML score...
ls <- family$ls(y,weights,theta,scale)
nt <- length(theta)
lsth1 <- if (nt>0) ls$lsth1[1:nt] else rep(0,0)
lsth2 <- if (nt>0) as.matrix(ls$lsth2)[1:nt,1:nt] else matrix(0,0,0) ## exclude any derivs w.r.t log scale here
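## Laplace approximate negative (RE)ML score assembled below (cf. gam.fit3):
## REML = {(dev + b'Sb)/(2*scale) - ls}/gamma + (log|H+S| - log|S|_+)/2
##        - Mp*(log(2*pi*scale) - log(gamma))/2,
## the last term included for "REML" only, where b'Sb = oo$P,
## log|H+S| = oo$ldet, log|S|_+ = rp$det and Mp is the null space dimension.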
REML <- ((dev+oo$P)/(2*scale) - ls$ls)/gamma + (oo$ldet - rp$det)/2 -
as.numeric(scoreType=="REML") * Mp * (log(2*pi*scale)/2-log(gamma)/2)
if (deriv) {
det1 <- oo$ldet1
if (nSp) {
ind <- 1:nSp + length(theta)
det1[ind] <- det1[ind] - rp$det1
}
REML1 <- ((oo$D1+oo$P1)/(2*scale) - c(lsth1,rep(0,length(sp))))/gamma + (det1)/2
if (deriv>1) {
ls2 <- D2*0;if (nt>0) ls2[1:nt,1:nt] <- lsth2
if (nSp) ldet2[ind,ind] <- ldet2[ind,ind] - rp$det2
REML2 <- ((D2+bSb2)/(2*scale) - ls2)/gamma + ldet2/2
}
}
if (!scale.known&&deriv) { ## need derivatives wrt log scale, too
Dp <- dev + oo$P
dlr.dlphi <- (-Dp/(2 *scale) - ls$lsth1[nt+1])/gamma - as.numeric(scoreType=="REML") * Mp/2
d2lr.d2lphi <- (Dp/(2*scale) - ls$lsth2[nt+1,nt+1])/gamma
d2lr.dspphi <- -(oo$D1+oo$P1)/(2*scale*gamma)
if (nt>0) d2lr.dspphi[1:nt] <- d2lr.dspphi[1:nt] - ls$lsth2[nt+1,1:nt]/gamma
REML1 <- c(REML1,dlr.dlphi)
if (deriv==2) {
REML2 <- rbind(REML2,as.numeric(d2lr.dspphi))
REML2 <- cbind(REML2,c(as.numeric(d2lr.dspphi),d2lr.d2lphi))
}
}
}
nth <- length(theta)
if (deriv) db.drho <- T %*% db.drho
if (deriv>0&&family$n.theta==0&&nth>0) { ## need to drop derivs for fixed theta
REML1 <- REML1[-(1:nth)]
if (deriv>1) REML2 <- REML2[-(1:nth),-(1:nth)]
db.drho <- db.drho[,-(1:nth),drop=FALSE]
}
names(coef) <- xnames
names(residuals) <- ynames
wtdmu <- sum(weights * y)/sum(weights) ## crude estimate - has to be corrected later when this is inappropriate
## wtdmu <- sum(weights * mu)/sum(weights) ## changed from y
nulldev <- sum(dev.resids(y, rep(wtdmu,length(y)), weights)) ## this will be corrected in family postproc
n.ok <- nobs - sum(weights == 0)
nulldf <- n.ok
ww <- wt <- rep.int(0, nobs)
wt[good] <- wf
ww[good] <- w
if (deriv && nrow(dw.drho)!=nrow(x)) {
w1 <- dw.drho
dw.drho <- matrix(0,nrow(x),ncol(w1))
dw.drho[good,] <- w1
}
## dev is used in family$aic to estimate the scale as dev/sum(weights), or is unused.
## the form is for compatibility with exponential families, but the estimator can
## be biased for low count data in the extended family case. So better to force the
## actual scale estimate to be used...
aic.model <- family$aic(y, mu, theta, weights, dev=scale*sum(weights)) # note: incomplete 2*edf needs to be added
list(coefficients = coef,residuals=residuals,fitted.values = mu,
family=family, linear.predictors = eta,deviance=dev,
null.deviance=nulldev,iter=iter,
weights=wt, ## note that these are Fisher type weights
prior.weights=weights,
working.weights = ww, ## working weights
df.null = nulldf, y = y, converged = conv,z=z,
boundary = boundary,
REML=REML,REML1=REML1,REML2=REML2,NCV=NCV,NCV1=NCV1,
rV=T %*% rV,db.drho=db.drho,dw.drho=dw.drho,
scale.est=scale,reml.scale=scale,
aic=aic.model,
rank=oo$rank.est,
K=Kmat,control=control,Vg=Vg,
dVkk = matrix(oo$dVkk,nSp,nSp),ldetS1 = if (grderiv) rp$det1 else 0
#,D1=oo$D1,D2=D2,
#ldet=oo$ldet,ldet1=oo$ldet1,ldet2=ldet2,
#bSb=oo$P,bSb1=oo$P1,bSb2=bSb2,
#ls=ls$ls,ls1=ls$lsth1,ls2=ls$lsth2
)
} ## gam.fit4
efsudr <- function(x,y,lsp,Eb,UrS,weights,family,offset=0,start=NULL,etastart=NULL,mustart=NULL,
U1=diag(ncol(x)), intercept = TRUE,scale=1,Mp=-1,control=gam.control(),n.true=-1,...) {
## Extended Fellner-Schall method for regular and extended families,
## with PIRLS performed by gam.fit3/4.
## tr(S^-S_j) is returned by ldetS1, rV %*% t(rV)*scale is
## cov matrix. I think b'S_jb will need to be computed here.
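## The intended multiplicative update is the extended Fellner-Schall step of
## Wood & Fasiolo (2017):
## lambda_j <- lambda_j * phi * max(0, tr(S^- S_j) - tr(Vb S_j)/phi)/(b'S_j b),
## with tr(S^- S_j) recovered from ldetS1*exp(-lsp) and tr(Vb S_j)/phi from
## the rV based computation in the loop below.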
nsp <- length(UrS)
if (inherits(family,"extended.family")) {
spind <- family$n.theta + 1:nsp
thind <- if (family$n.theta>0) 1:family$n.theta else rep(0,0)
} else {
thind <- rep(0,0)
spind <- 1:nsp ## index of smoothing params in lsp
}
estimate.scale <- (length(lsp)>max(spind))
lsp[spind] <- lsp[spind] + 2.5
mult <- 1
fit <- gam.fit3(x=x, y=y, sp=lsp, Eb=Eb,UrS=UrS,
weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family,
control = control, intercept = intercept,deriv=0,
gamma=1,scale=scale,scoreType="EFS",
n.true=n.true,...)
if (length(thind)>0) lsp[thind] <- family$getTheta()
if (estimate.scale) lsp[length(lsp)] <- log(fit$scale)
## Also need scale estimate. OK from gam.fit3, but gam.fit4 version probably needs correcting
## for edf, as obtained via MLE.
p <- ncol(x)
n <- nrow(x)
score.hist <- rep(0,200)
bSb <- trVS <- rep(0,nsp)
for (iter in 1:200) {
start <- fit$coefficients
Y <- U1[,1:(ncol(U1)-Mp)] ## penalty range space
## project coefs and rV to Y, since this is space of UrS[[i]]
Yb <- drop(t(Y)%*%start)
rV <- t(fit$rV) ## so t(rV)%*%rV*scale is cov matrix
rVY <- rV %*% Y
## ith penalty is UrS[[i]]%*%t(UrS[[i]])...
for (i in 1:length(UrS)) {
xx <- Yb %*% UrS[[i]]
bSb[i] <- sum(xx^2)
xx <- rVY %*% UrS[[i]]
trVS[i] <- sum(xx^2)
}
edf <- p - sum(trVS*exp(lsp[spind]))
if (inherits(family,"extended.family")&&estimate.scale) {
fit$scale <- fit$scale*n/(n-edf) ## correct for edf.
}
a <- pmax(0,fit$ldetS1*exp(-lsp[spind]) - trVS) ## NOTE: double check scaling here
phi <- if (estimate.scale) fit$scale else scale
r <- a/pmax(0,bSb)*phi
r[a==0&bSb==0] <- 1
r[!is.finite(r)] <- 1e6
lsp1 <- lsp
lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax)
max.step <- max(abs(lsp1-lsp))
old.reml <- fit$REML
fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS,
weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family,
control = control, intercept = intercept,deriv=0,mustart=mustart,
gamma=1,scale=scale,scoreType="EFS",
n.true=n.true,...)
if (length(thind)>0) lsp1[thind] <- family$getTheta()
if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale)
## some step length control...
if (fit$REML<=old.reml) { ## improvement
if (max.step<.05) { ## consider step extension (near optimum)
lsp2 <- lsp
lsp2[spind] <- pmin(lsp[spind] + log(r)*mult*2,control$efs.lspmax) ## try extending step...
fit2 <- gam.fit3(x=x, y=y, sp=lsp2, Eb=Eb,UrS=UrS,
weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family,
control = control, intercept = intercept,deriv=0,mustart=mustart,
gamma=1,scale=scale,scoreType="EFS",
n.true=n.true,...)
if (length(thind)>0) lsp2[thind] <- family$getTheta()
if (estimate.scale) lsp2[length(lsp)] <- log(fit2$scale)
if (fit2$REML < fit$REML) { ## improvement - accept extension
fit <- fit2;lsp <- lsp2
mult <- mult * 2
} else { ## accept old step
lsp <- lsp1
}
} else lsp <- lsp1
} else { ## no improvement
while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML
mult <- mult/2 ## contract step
lsp1 <- lsp
lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax)
fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS,
weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family,
control = control, intercept = intercept,deriv=0,mustart=mustart,
gamma=1,scale=scale,scoreType="EFS",
n.true=n.true,...)
if (length(thind)>0) lsp1[thind] <- family$getTheta()
if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale)
}
lsp <- lsp1
if (mult<1) mult <- 1
}
score.hist[iter] <- fit$REML
## break if EFS step small and REML change negligible over last 3 steps.
if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))<control$efs.tol) break
## or break if deviance not changing...
if (iter==1) old.dev <- fit$dev else {
if (abs(old.dev-fit$dev) < 100*control$eps*abs(fit$dev)) break
old.dev <- fit$dev
}
}
fit$sp <- exp(lsp)
fit$iter <- iter
fit$outer.info <- list(iter = iter,score.hist=score.hist[1:iter])
fit$outer.info$conv <- if (iter==200) "iteration limit reached" else "full convergence"
fit
} ## efsudr
gam.fit5 <- function(x,y,lsp,Sl,weights=NULL,offset=NULL,deriv=2,family,scoreType="REML",
control=gam.control(),Mp=-1,start=NULL,gamma=1,nei=NULL){
## NOTE: offset handling - needs to be passed to ll code
## fit models by general penalized likelihood method,
## given doubly extended family in family. lsp is log smoothing parameters
## Stabilization strategy:
## 1. Sl.repara
## 2. Hessian diagonally pre-conditioned if +ve diagonal elements
## (otherwise indefinite anyway)
## 3. Newton fit with perturbation of any indefinite hessian
## 4. At convergence test fundamental rank on balanced version of
## penalized Hessian. Drop unidentifiable parameters and
## continue iteration to adjust others.
## 5. All remaining computations in reduced space.
##
## Idea is that rank detection takes care of structural co-linearity,
## while preconditioning and step 1 take care of extreme smoothing parameters
## related problems.
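## Sketch of 2. and 3. as implemented below: with D = diag(Hp)^{-1/2} the
## pre-conditioned Hessian D*Hp*D has unit diagonal, and its pivoted Cholesky
## factor both reveals rank deficiency (via attr(L,"rank")) and provides
## stable solves for the Newton step; if indefiniteness is detected a ridge
## (or an eigen based fix) is added until the factorization succeeds.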
penalized <- if (length(Sl)>0) TRUE else FALSE
warn <- list()
nSp <- length(lsp)
q <- ncol(x)
nobs <- length(y)
if (penalized) {
Eb <- attr(Sl,"E") ## balanced penalty sqrt
## the stability reparameterization + log|S|_+ and derivs...
rp <- ldetS(Sl,rho=lsp,fixed=rep(FALSE,length(lsp)),np=q,root=TRUE)
x <- Sl.repara(rp$rp,x) ## apply re-parameterization to x
Eb <- Sl.repara(rp$rp,Eb) ## root balanced penalty
St <- crossprod(rp$E) ## total penalty matrix
E <- rp$E ## root total penalty
attr(E,"use.unscaled") <- TRUE ## signal initialization code that E not to be further scaled
if (!is.null(start)) start <- Sl.repara(rp$rp,start) ## re-para start
## NOTE: it can be that other attributes need re-parameterization here
## this should be done in 'family$initialize' - see mvn for an example.
} else { ## unpenalized so no derivatives required
deriv <- 0
rp <- list(ldetS=0,rp=list())
St <- matrix(0,q,q)
E <- matrix(0,0,q) ## can be needed by initialization code
}
## now call initialization code, but make sure that any
## supplied 'start' vector is not overwritten...
start0 <- start
## Assumption here is that the initialization code is fine with
## re-parameterized x...
eval(family$initialize)
if (!is.null(start0)) start <- start0
coef <- as.numeric(start)
if (is.null(weights)) weights <- rep.int(1, nobs)
if (is.null(offset)) offset <- rep.int(0, nobs)
## get log likelihood, grad and Hessian (w.r.t. coefs - not s.p.s) ...
llf <- family$ll
ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1)
ll0 <- ll$l - drop(t(coef)%*%St%*%coef)/2
grad <- ll$lb - St%*%coef
iconv <- max(abs(grad))<control$epsilon*abs(ll0)
Hp <- -ll$lbb+St
rank.checked <- FALSE ## not yet checked the intrinsic rank of problem
rank <- q
eigen.fix <- FALSE
converged <- FALSE
check.deriv <- FALSE; eps <- 1e-5
drop <- NULL;bdrop <- rep(FALSE,q) ## by default nothing dropped
perturbed <- 0 ## counter for number of times perturbation tried on possible saddle
for (iter in 1:(2*control$maxit)) { ## main iteration
## get Newton step...
if (check.deriv) { ## code for checking derivatives when debugging
fdg <- ll$lb*0; fdh <- ll$lbb*0
for (k in 1:length(coef)) {
coef1 <- coef;coef1[k] <- coef[k] + eps
ll.fd <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
fdg[k] <- (ll.fd$l-ll$l)/eps
fdh[,k] <- (ll.fd$lb-ll$lb)/eps
}
} ## derivative checking end
#grad <- ll$lb - St%*%coef
#Hp <- -ll$lbb+St
kappaH <- kappa(Hp)
D <- diag(Hp)
if (sum(!is.finite(D))>0) stop("non finite values in Hessian")
if (min(D)<=0) { ## 2/2/19 replaces any D<0 indicating indef
Dthresh <- max(D)*sqrt(.Machine$double.eps)
if (-min(D) < Dthresh) { ## could be indef or +ve semi def
indefinite <- FALSE
D[D<Dthresh] <- Dthresh
} else indefinite <- TRUE ## min D too negative to be semi def
} else indefinite <- FALSE ## On basis of D could be +ve def
if (indefinite) { ## Hessian indefinite, for sure
## D <- rep(1,ncol(Hp)) # moved to later otherwise Ip/Ib pointless 2/2/19
if (eigen.fix) {
eh <- eigen(Hp,symmetric=TRUE);
ev <- abs(eh$values)
Hp <- eh$vectors%*%(ev*t(eh$vectors))
} else {
Ib <- diag(rank)*abs(min(D))
Ip <- diag(rank)*abs(max(D)*.Machine$double.eps^.5)
Hp <- Hp + Ip + Ib
}
D <- rep(1,ncol(Hp)) ## 2/2/19
indefinite <- TRUE
} else { ## Hessian could be +ve def in which case Choleski is cheap!
D <- D^-.5 ## diagonal pre-conditioner
Hp <- D*t(D*Hp) ## pre-condition Hp
Ip <- diag(rank)*.Machine$double.eps^.5
}
L <- suppressWarnings(chol(Hp,pivot=TRUE))
mult <- 1
while (attr(L,"rank") < rank) { ## rank deficient - add ridge penalty
if (eigen.fix) {
eh <- eigen(Hp,symmetric=TRUE);ev <- eh$values
thresh <- max(min(ev[ev>0]),max(ev)*1e-6)*mult
mult <- mult*10
ev[ev<thresh] <- thresh
Hp <- eh$vectors%*%(ev*t(eh$vectors))
L <- suppressWarnings(chol(Hp,pivot=TRUE))
} else {
L <- suppressWarnings(chol(Hp+Ip,pivot=TRUE))
Ip <- Ip * 100 ## increase regularization penalty
}
indefinite <- TRUE
}
piv <- attr(L,"pivot")
ipiv <- piv;ipiv[piv] <- 1:ncol(L)
if (converged) break ## converged at end of previous step, so break now L and D match Hp
#if (iconv&&!indefinite) break ## immediate convergence - bad idea - can lead to small sp changes having zero effect on objective.
step <- D*(backsolve(L,forwardsolve(t(L),(D*grad)[piv]))[ipiv])
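## i.e. step = Hp^{-1} grad, computed stably from the pre-conditioned factor:
## solve (D Hp D) u = D*grad via L (pivot order piv, inverse ipiv), then
## step = D*u.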
c.norm <- sum(coef^2)
if (c.norm>0) { ## limit step length to .1 of coef length
s.norm <- sqrt(sum(step^2))
c.norm <- sqrt(c.norm)
if (s.norm > .1*c.norm) step <- step*0.1*c.norm/s.norm
}
s.norm <- sqrt(sum(step^2)) ## store for scaling steepest if needed
## try the Newton step...
coef1 <- coef + step
ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
ll1 <- ll$l - drop(t(coef1)%*%St%*%coef1)/2
khalf <- 0;fac <- 2
## note: with ll1 < ll0 in place of ll1 <= ll0 in the next line we could
## repeatedly accept minuscule steps that do not actually improve anything.
llold <- ll ## avoid losing lbb slot on step failure
while ((!is.finite(ll1)||ll1 <= ll0) && khalf < 25) { ## step halve until it succeeds...
step <- step/fac;coef1 <- coef + step
ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0)
ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2
if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or at least no worse.
ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
}
## abort if step has made no difference...
if (max(abs(coef-coef1))<max(abs(coef))*.Machine$double.eps) khalf <- 100
khalf <- khalf + 1
if (khalf>5) fac <- 5
} ## end step halve
## Following tries steepest ascent if Newton failed. Only do this if not initially converged.
## Otherwise it is possible for SA to mess up when routine is called
## with converged parameter values - takes a tiny step that leads to a machine noise
## improvement, but pushes a gradient just over threshold, Newton then continues to fail
## and we get stuck with SA convergence rates.
## One reason to try SA (if grad not already zero) is that Newton step can become
## poor due to numerical conditioning problems, whereas SA is largely unaffected.
if (!is.finite(ll1) || (ll1<=ll0 && !iconv)) { ## switch to steepest ascent
#step <- 0.5*drop(grad)*mean(abs(coef))/mean(abs(grad))
step <- drop(grad)*s.norm/sqrt(sum(grad^2)) ## scaled to initial Newton step length
khalf <- 0
}
while ((!is.finite(ll1)||(ll1 <= ll0 && !iconv)) && khalf < 25) { ## step cut until it succeeds...
step <- step/10;coef1 <- coef + step
ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0)
ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2
if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or no worse
ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
}
## abort if step has made no difference...
if (max(abs(coef-coef1))<max(abs(coef))*.Machine$double.eps) khalf <- 100 ## step gone nowhere
khalf <- khalf + 1
}
if ((is.finite(ll1)&&ll1 >= ll0&&khalf<25)||iter==control$maxit) { ## step ok. Accept and test
coef <- coef + step
grad <- ll$lb - St%*%coef
Hp <- -ll$lbb+St
## convergence test...
ok <- (iter==control$maxit || max(abs(grad)) < control$epsilon*abs(ll0))
if (ok) { ## appears to have converged
if (indefinite) { ## not a well defined maximum
if (perturbed==5) stop("indefinite penalized likelihood in gam.fit5 ")
if (iter<4||rank.checked) {
perturbed <- perturbed + 1
coef <- coef*(1+(rep_len(c(0,1),length(coef))*.02-.01)*perturbed) +
(rep_len(c(0,1),length(coef)) - 0.5 ) * mean(abs(coef))*1e-5*perturbed
ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1)
ll0 <- ll$l - (t(coef)%*%St%*%coef)/2
} else {
rank.checked <- TRUE
if (penalized) {
Sb <- crossprod(Eb) ## balanced penalty
Hb <- -ll$lbb/norm(ll$lbb,"F")+Sb/norm(Sb,"F") ## balanced penalized hessian
} else Hb <- -ll$lbb/norm(ll$lbb,"F")
## apply pre-conditioning, otherwise badly scaled problems can result in
## wrong coefs being dropped...
D <- abs(diag(Hb))
D[D<1e-50] <- 1;D <- D^-.5
Hb <- t(D*Hb)*D
qrh <- qr(Hb,LAPACK=TRUE)
rank <- Rrank(qr.R(qrh))
if (rank < q) { ## rank deficient. need to drop and continue to adjust other params
drop <- sort(qrh$pivot[(rank+1):q]) ## set these params to zero
bdrop <- 1:q %in% drop ## TRUE FALSE version
## now drop the parameters and recompute ll0...
lpi <- attr(x,"lpi")
xat <- attributes(x)
xat$dim <- xat$dimnames <- NULL
coef <- coef[-drop]
St <- St[-drop,-drop]
x <- x[,-drop] ## dropping columns from model matrix
if (!is.null(lpi)) { ## need to adjust column indexes as well
ii <- (1:q)[!bdrop];ij <- rep(NA,q)
ij[ii] <- 1:length(ii) ## col i of old model matrix is col ij[i] of new
for (i in 1:length(lpi)) {
lpi[[i]] <- ij[lpi[[i]][!(lpi[[i]]%in%drop)]] # drop and shuffle up
}
} ## lpi adjustment done
if (length(xat)>0) for (i in 1:length(xat)) attr(x,names(xat)[i]) <- xat[[i]]
attr(x,"lpi") <- lpi
attr(x,"drop") <- drop ## useful if family has precomputed something from x
ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1)
ll0 <- ll$l - (t(coef)%*%St%*%coef)/2
grad <- ll$lb - St%*%coef
Hp <- -ll$lbb+St
}
}
} else { ## not indefinite really converged
converged <- TRUE
# ... don't break until L and D made to match final Hp
}
} else ll0 <- ll1 ## step ok but not converged yet
} else { ## step failed.
ll <- llold ## restore old ll with lbb slot
if (is.null(drop)) bdrop <- rep(FALSE,q)
if (iconv && iter==1) { ## OK to fail on first step if apparently converged to start with
converged <- TRUE ## Note: important to check if improvement possible even if apparently
coef <- start ## converged, otherwise sp changes can lead to no sp objective change!
} else {
converged <- FALSE
## NOTE: the threshold can be unrealistic if gradient or step can't be computed to this accuracy, e.g.
## because of very large smoothing parameters - could estimate grad/step accuracy from machine zero
## perturbation of grad/step calc, but perhaps too fussy.
coefp <- coef*(1+rep_len(c(-1,1),length(coef))*.Machine$double.eps^.9)
llp <- llf(y,x,coefp,weights,family,offset=offset,deriv=1)
gradp <- llp$lb - St%*%coefp
err <- min(1e-3,kappaH*max(1,mean(abs(gradp-grad))/mean(abs(coefp-coef)))*.Machine$double.eps)
## err is an estimate of the achievable relative error, capped above at 1e-3 to ensure
## this level of stability loss gets reported!
if (max(abs(grad/drop(ll0)))>max(err,control$epsilon*2)) warn[[length(warn)+1]] <-
paste("gam.fit5 step failed: max magnitude relative grad =",max(abs(grad/drop(ll0))))
}
break ## no need to recompute L and D, so break now
}
} ## end of main fitting iteration
## at this stage the Hessian (of pen lik. w.r.t. coefs) should be +ve semi definite,
## so that the pivoted Choleski factor should exist...
if (iter == 2*control$maxit && !converged)
warn[[length(warn)+1]] <- gettextf("gam.fit5 iteration limit reached: max abs grad = %g",max(abs(grad)))
ldetHp <- 2*sum(log(diag(L))) - 2 * sum(log(D)) ## log |Hp|
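## L is the Cholesky factor of the pre-conditioned D*Hp*D, and
## |D*Hp*D| = |D|^2 |Hp|, so log|Hp| = 2*sum(log(diag(L))) - 2*sum(log(D)).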
if (!is.null(drop)) { ## create full version of coef with zeros for unidentifiable
fcoef <- rep(0,length(bdrop));fcoef[!bdrop] <- coef
} else fcoef <- coef
dVkk <- d1l <- d2l <- d1bSb <- d2bSb <- d1b <- d2b <- d1ldetH <- d2ldetH <- NULL
ncv <- scoreType=="NCV"
if (deriv>0) { ## Implicit differentiation for derivs...
m <- nSp
d1b <- matrix(0,rank,m)
Sib <- Sl.termMult(rp$Sl,fcoef,full=TRUE) ## list of penalties times coefs
if (nSp) for (i in 1:m) d1b[,i] <-
-D*(backsolve(L,forwardsolve(t(L),(D*Sib[[i]][!bdrop])[piv]))[ipiv])
## obtain the curvature check matrix...
dVkk <- crossprod(L[,ipiv]%*%(d1b/D))
if (!is.null(drop)) { ## create full version of d1b with zeros for unidentifiable
fd1b <- matrix(0,q,m)
fd1b[!bdrop,] <- d1b
} else fd1b <- d1b
## Now call the family again to get the first derivative of the Hessian w.r.t.
## the smoothing parameters, in list d1H. Alternatively, if deriv==1 and !ncv,
## then tr(Hi d1H) is returned as a vector in 'd1H'.
ll <- if (ncv) llf(y,x,coef,weights,family,offset=offset,deriv=3,d1b=d1b,ncv=TRUE) else
llf(y,x,coef,weights,family,offset=offset,deriv=2+(deriv>1),d1b=d1b,fh=D*t(D*chol2inv(L)[ipiv,ipiv]))
# d1l <- colSums(ll$lb*d1b) # cancels
if (deriv>1) { ## Implicit differentiation for the second derivatives is now possible...
d2b <- matrix(0,rank,m*(m+1)/2)
k <- 0
for (i in 1:m) for (j in i:m) {
k <- k + 1
v <- -ll$d1H[[i]]%*%d1b[,j] + Sl.mult(rp$Sl,fd1b[,j],i)[!bdrop] + Sl.mult(rp$Sl,fd1b[,i],j)[!bdrop]
d2b[,k] <- -D*(backsolve(L,forwardsolve(t(L),(D*v)[piv]))[ipiv])
if (i==j) d2b[,k] <- d2b[,k] + d1b[,i]
}
## Now call family for last time to get trHid2H the tr(H^{-1} d^2 H / drho_i drho_j)...
llr <- llf(y,x,coef,weights,family,offset=offset,deriv=4,d1b=d1b,d2b=d2b,
Hp=Hp,rank=rank,fh = L,D=D)
## Now compute Hessian of log lik w.r.t. log sps using chain rule
# d2la <- colSums(ll$lb*d2b) # cancels
# k <- 0
d2l <- matrix(0,m,m)
for (i in 1:m) for (j in i:m) {
# k <- k + 1
d2l[j,i] <- d2l[i,j] <- # d2la[k] + # cancels
t(d1b[,i])%*%ll$lbb%*%d1b[,j]
}
} ## if (deriv > 1)
} ## if (deriv > 0)
Vg <- NULL ## jackknife grad matrix in NCV case
if (scoreType=="NCV") {
REML <- REML1 <- REML2 <- NULL
if (deriv==0) ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1,d1b=d1b,ncv=TRUE) ## otherwise l1, l2 not returned
ncv <- family$ncv ## helps debugging!
deriv1 <- if (nei$jackknife>2) -1 else if (deriv==0) 0 else 1
## create nei if null - now in estimate.gam
#if (is.null(nei)||is.null(nei$k)||is.null(nei$m)) nei <- list(i=1:nobs,mi=1:nobs,m=1:nobs,k=1:nobs) ## LOOCV
#if (is.null(nei$i)) if (length(nei$m)==nobs) nei$mi <- nei$i <- 1:nobs else stop("unclear which points NCV neighbourhoods belong to")
#if (length(nei$mi)!=length(nei$m)) stop("for NCV number of dropped and predicted neighbourhoods must match")
## complete dH
if (deriv>0) {
for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- ll$d1H[[i]] - Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop]
}
R1 <- try(chol(t(Hp/D)/D),silent=TRUE)
ll$gamma <- gamma;
## note: use of quadratic approx to NCV signalled by family$qapprox
#if (overlap||inherits(R1,"try-error"))
if (inherits(R1,"try-error")) {
## get H (Hp?) and Hi
Hi <- t(D*chol2inv(L)[ipiv,ipiv])*D
ret <- ncv(x,y,weights,nei,coef,family,ll,H=t(Hp/D)/D,Hi=Hi,offset=offset,dH=ll$d1H,
db=d1b,deriv=deriv1)
warn[[length(warn)+1]] <- "NCV update positive definite check not possible"
} else { ## cholesky version
ret <- ncv(x,y,weights,nei,coef,family,ll,R=R1,offset=offset,dH=ll$d1H,db=d1b,
deriv=deriv1,nt=control$ncv.threads)
if (ret$error>0) warn[[length(warn)+1]] <- "some NCV updates not positive definite"
}
NCV <- ret$NCV
NCV1 <- ret$NCV1
Vg <- ret$Vg
if (deriv1<0) { ## Jackknife cov matrix...
nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights
dd <- attr(ret$NCV,"deta.cv")
#dd <-jkw*t(dd)%*%t(T)
dd <- Sl.repa(rp$rp,t(dd),l=-1) ## undo repara
dd <- Sl.inirep(Sl,dd,l=1) ## undo initial repara
#Vj <- tcrossprod(dd) ## jackknife cov matrix
#attr(NCV,"Vj") <- Vj
attr(NCV,"dd") <- jkw*t(dd)
}
} else { ## REML required
NCV <- NCV1 <- NULL
## Compute the derivatives of log|H+S|...
if (deriv > 0) { ## only first derivative wrt rho required...
if (deriv==1&&!is.list(ll$d1H)) {
d1ldetH <- -ll$d1H
for (i in 1:m) {
A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop]
bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block
A <- A[,bind,drop=FALSE] ## drop the zero columns
A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE])
d1ldetH[i] <- d1ldetH[i] + sum(diag(A[bind,,drop=FALSE]))
}
} else { ## deriv>1 || is.list(ll$d1H)
## computation of d1ldetH using full d1H derivative of Hessian w.r.t. sp matrices...
d1ldetH <- rep(0,m)
d1Hp <- list()
for (i in 1:m) {
A <- -ll$d1H[[i]] + Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop]
d1Hp[[i]] <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE])
d1ldetH[i] <- sum(diag(d1Hp[[i]]))
}
}
} ## if (deriv >0)
if (deriv > 1) {
## second derivatives...
d2ldetH <- matrix(0,m,m)
k <- 0
for (i in 1:m) for (j in i:m) {
k <- k + 1
d2ldetH[i,j] <- -sum(d1Hp[[i]]*t(d1Hp[[j]])) - llr$trHid2H[k]
if (i==j) { ## need to add term relating to smoothing penalty
A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop]
bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block
A <- A[,bind,drop=FALSE] ## drop the zero columns
A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE])
d2ldetH[i,j] <- d2ldetH[i,j] + sum(diag(A[bind,,drop=FALSE]))
} else d2ldetH[j,i] <- d2ldetH[i,j]
}
} ## if (deriv > 1)
## Compute derivs of b'Sb...
if (deriv>0) {
Skb <- Sl.termMult(rp$Sl,fcoef,full=TRUE)
d1bSb <- rep(0,m)
for (i in 1:m) {
Skb[[i]] <- Skb[[i]][!bdrop]
d1bSb[i] <- sum(coef*Skb[[i]])
}
}
if (deriv>1) {
d2bSb <- matrix(0,m,m)
for (i in 1:m) {
Sd1b <- St%*%d1b[,i]
for (j in i:m) {
d2bSb[j,i] <- d2bSb[i,j] <- 2*sum(
d1b[,i]*Skb[[j]] + d1b[,j]*Skb[[i]] + d1b[,j]*Sd1b)
}
d2bSb[i,i] <- d2bSb[i,i] + sum(coef*Skb[[i]])
}
}
## get grad and Hessian of REML score...
REML <- -as.numeric((ll$l - drop(t(coef)%*%St%*%coef)/2)/gamma + rp$ldetS/2 - ldetHp/2 +
Mp*(log(2*pi)/2)-log(gamma)/2)
REML1 <- if (deriv<1) NULL else -as.numeric( # d1l # cancels
- d1bSb/(2*gamma) + rp$ldet1/2 - d1ldetH/2 )
REML2 <- if (deriv<2) NULL else -( (d2l - d2bSb/2)/gamma + rp$ldet2/2 - d2ldetH/2 )
} ## REML computations
if (control$trace) {
cat("\niter =",iter," ll =",ll$l," REML =",REML," bSb =",t(coef)%*%St%*%coef/2,"\n")
cat("log|S| =",rp$ldetS," log|H+S| =",ldetHp," n.drop =",length(drop),"\n")
if (!is.null(REML1)) cat("REML1 =",REML1,"\n")
}
## Get possibly multiple linear predictors
lpi <- attr(x,"lpi")
if (is.null(lpi)) { ## only one...
linear.predictors <- if (is.null(offset)) as.numeric(x%*%coef) else as.numeric(x%*%coef+offset)
fitted.values <- family$linkinv(linear.predictors)
} else { ## multiple...
fitted.values <- linear.predictors <- matrix(0,nrow(x),length(lpi))
if (!is.null(offset)) offset[[length(lpi)+1]] <- 0
for (j in 1:length(lpi)) {
linear.predictors[,j] <- as.numeric(x[,lpi[[j]],drop=FALSE] %*% coef[lpi[[j]]])
if (!is.null(offset[[j]])) linear.predictors[,j] <- linear.predictors[,j] + offset[[j]]
fitted.values[,j] <- family$linfo[[j]]$linkinv( linear.predictors[,j])
}
}
coef <- Sl.repara(rp$rp,fcoef,inverse=TRUE) ## undo re-parameterization of coef
#coef <- Sl.repa(rp$rp,fcoef,l=-1)
if (!is.null(drop)&&!is.null(d1b)) { ## create full version of d1b with zeros for unidentifiable
db.drho <- matrix(0,length(bdrop),ncol(d1b));db.drho[!bdrop,] <- d1b
} else db.drho <- d1b
## and undo re-para...
#if (!is.null(d1b)) db.drho <- t(Sl.repara(rp$rp,t(db.drho),inverse=TRUE,both.sides=FALSE)) ## wrong
if (!is.null(d1b)) db.drho <- Sl.repa(rp$rp,db.drho,l=-1)
#if (!is.null(d2b)) d2b <- Sl.repa(rp$rp,d2b,l=-1) ## NOTE: DEBUG only
## Following needed for debugging H derivatives if Cholesky stabilization used
#if (!is.null(ll$d1H)) for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- Sl.repa(rp$rp,ll$d1H[[i]],l=2,r=1) ## debug
#ll$lbb <- Sl.repa(rp$rp,ll$lbb,l=2,r=1) ## debug
ret <- list(coefficients=coef,family=family,y=y,prior.weights=weights,
fitted.values=fitted.values, linear.predictors=linear.predictors,
scale.est=1, ### NOTE: needed by newton, but what is sensible here?
REML= REML,REML1= REML1,REML2=REML2,NCV=NCV,NCV1=NCV1,
rank=rank,aic = -2*ll$l, ## 2*edf needs to be added
##deviance = -2*ll$l,
l= ll$l,## l1 =d1l,l2 =d2l,
lbb = ll$lbb, ## Hessian of log likelihood
L=L, ## chol factor of pre-conditioned penalized hessian
bdrop=bdrop, ## logical index of dropped parameters
D=D, ## diagonal preconditioning matrix
St=St, ## total penalty matrix
rp = rp$rp,Vg=Vg,
db.drho = db.drho, ## derivative of penalty coefs w.r.t. log sps.
#bSb = bSb, bSb1 = d1bSb,bSb2 = d2bSb,
S1=rp$ldet1,
#S=rp$ldetS,S1=rp$ldet1,S2=rp$ldet2,
#Hp=ldetHp,Hp1=d1ldetH,Hp2=d2ldetH,
#b2 = d2b,
iter=iter,H = ll$lbb,dH = ll$d1H,dVkk=dVkk,warn=warn)#,d2H=llr$d2H)
## debugging code to allow components of 2nd deriv of hessian w.r.t. sp.s
## to be passed to deriv.check....
#if (!is.null(ll$ghost1)&&!is.null(ll$ghost2)) {
# ret$ghost1 <- ll$ghost1; ret$ghost2 <- ret$ghost2
#}
ret
} ## end of gam.fit5
efsud <- function(x,y,lsp,Sl,weights=NULL,offset=NULL,family,
control=gam.control(),Mp=-1,start=NULL) {
## Extended Fellner-Schall method for general families
## tr(S^-S_j) is returned by ldetS as ldet1 - S1 from gam.fit5
## b'S_jb is computed as d1bSb in gam.fit5
## tr(V S_j) will need to be computed using Sl.termMult
## Sl returned by ldetS and Vb computed as in gam.fit5.postproc.
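## The update below is the general likelihood version of the EFS step:
## lambda_j <- lambda_j * max(0, tr(S^- S_j) - tr(Vb S_j))/(b'S_j b),
## with tr(S^- S_j) recovered from S1*exp(-lsp) and Vb the inverse penalized
## Hessian rebuilt from gam.fit5's Cholesky factor.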
tol <- 1e-6
lsp <- lsp + 2.5
mult <- 1
fit <- gam.fit5(x=x,y=y,lsp=lsp,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family,
control=control,Mp=Mp,start=start,gamma=1)
score.hist <- rep(0,200)
tiny <- .Machine$double.eps^.5 ## used to bound above zero
for (iter in 1:200) {
start <- fit$coefficients
## obtain Vb...
ipiv <- piv <- attr(fit$L,"pivot")
p <- length(piv)
ipiv[piv] <- 1:p
Vb <- crossprod(forwardsolve(t(fit$L),diag(fit$D,nrow=p)[piv,,drop=FALSE])[ipiv,,drop=FALSE])
if (sum(fit$bdrop)) { ## some coefficients were dropped...
q <- length(fit$bdrop)
ibd <- !fit$bdrop
Vtemp <- Vb; Vb <- matrix(0,q,q)
Vb[ibd,ibd] <- Vtemp
}
Vb <- Sl.repara(fit$rp,Vb,inverse=TRUE)
SVb <- Sl.termMult(Sl,Vb) ## this could be made more efficient
trVS <- rep(0,length(SVb))
for (i in 1:length(SVb)) {
ind <- attr(SVb[[i]],"ind")
trVS[i] <- sum(diag(SVb[[i]][,ind]))
}
Sb <- Sl.termMult(Sl,start,full=TRUE)
bSb <- rep(0,length(Sb))
for (i in 1:length(Sb)) {
bSb[i] <- sum(start*Sb[[i]])
}
a <- pmax(tiny,fit$S1*exp(-lsp) - trVS) ## tr(S^-S_j) - tr(Vb S_j), kept above zero
r <- a/pmax(tiny,bSb) ## multiplicative EFS update factor for lambda_j
r[a==0&bSb==0] <- 1
r[!is.finite(r)] <- 1e6
lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax)
max.step <- max(abs(lsp1-lsp))
old.reml <- fit$REML
fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0,
family=family,control=control,Mp=Mp,start=start,gamma=1)
## some step length control...
if (fit$REML<=old.reml) { ## improvement
if (max.step<.05) { ## consider step extension
lsp2 <- pmin(lsp + log(r)*mult*2,control$efs.lspmax) ## try extending step...
fit2 <- gam.fit5(x=x,y=y,lsp=lsp2,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family,
control=control,Mp=Mp,start=start,gamma=1)
if (fit2$REML < fit$REML) { ## improvement - accept extension
fit <- fit2;lsp <- lsp2
mult <- mult * 2
} else { ## accept old step
lsp <- lsp1
}
} else lsp <- lsp1
} else { ## no improvement
while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML
mult <- mult/2 ## contract step
lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax)
fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family,
control=control,Mp=Mp,start=start,gamma=1)
}
lsp <- lsp1
if (mult<1) mult <- 1
}
score.hist[iter] <- fit$REML
## break if EFS step small and REML change negligible over last 3 steps.
if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))<control$efs.tol) break
## or break if log lik not changing...
if (iter==1) old.ll <- fit$l else {
if (abs(old.ll-fit$l)<100*control$eps*abs(fit$l)) break
old.ll <- fit$l
}
}
fit$sp <- exp(lsp)
fit$iter <- iter
fit$outer.info <- list(iter = iter,score.hist=score.hist[1:iter])
fit$outer.info$conv <- if (iter==200) "iteration limit reached" else "full convergence"
if (!is.null(fit$warn)&&length(fit$warn)>0) for (i in 1:length(fit$warn)) warning(fit$warn[[i]])
fit
} ## efsud
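## The pivoted, diagonally pre-conditioned Choleski inversion pattern used
## in efsud above (and again in gam.fit5.post.proc below), in isolation.
## A minimal sketch under the same conventions - 'pchol.inv' is a
## hypothetical name, not part of the fitting code. With D = 1/sqrt(diag(H))
## the pre-conditioned Hp = D H D has unit diagonal, and solve(H) is
## recovered from the pivoted Choleski factor of Hp...
pchol.inv <- function(H) {
## invert +ve definite H via diagonal pre-conditioning and pivoted Choleski
p <- ncol(H)
D <- 1/sqrt(diag(H)) ## diagonal pre-conditioner
Hp <- D*t(D*H) ## Hp = diag(D)%*%H%*%diag(D)
L <- suppressWarnings(chol(Hp,pivot=TRUE))
if (attr(L,"rank")<p) stop("H not +ve definite")
piv <- attr(L,"pivot")
## t(L)%*%L = Hp[piv,piv], so crossprod of forwardsolve(t(L),diag(D)[piv,])
## gives diag(D)%*%solve(Hp)%*%diag(D) = solve(H)...
crossprod(forwardsolve(t(L),diag(D,nrow=p)[piv,,drop=FALSE]))
} ## pchol.inv
## e.g. H <- crossprod(matrix(rnorm(40),10,4)) + diag(4)
## range(pchol.inv(H) - solve(H)) ## ~ machine zero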
gam.fit5.post.proc <- function(object,Sl,L,lsp0,S,off,gamma) {
## object is object returned by gam.fit5, Sl is penalty object, L maps working sp
## vector to full sp vector
## Computes:
## R - unpivoted Choleski of estimated expected hessian of ll
## Vb - the Bayesian cov matrix,
## Ve - "frequentist" alternative
## F - the EDF matrix
## edf = diag(F) and edf1 = diag(2F-FF)
## Main issue is that lbb and lbb + St must be +ve definite for
## F to make sense.
## NOTE: what comes in is in stabilizing parameterization from
## gam.fit5, and may have had parameters dropped.
## possibly initial reparam needs to be undone here as well
## before formation of F....
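## In matrix terms, with H = -lbb (repaired to be +ve semi-definite below)
## and Hp = H + St: Vb = Hp^{-1} (Bayesian cov matrix), F = Vb H (EDF
## matrix), edf = diag(F), Ve = F Vb = Vb H Vb ("frequentist" cov matrix)
## and edf1 = diag(2F - FF).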
lbb <- -object$lbb ## Hessian of log likelihood in fit parameterization
p <- ncol(lbb)
ipiv <- piv <- attr(object$L,"pivot")
ipiv[piv] <- 1:p
## Vb0 <- crossprod(forwardsolve(t(object$L),diag(object$D,nrow=p)[piv,])[ipiv,])
## Bayes cov matrix with learning rate 1/gamma. Wrong: in this parameterization Vp*gamma is the required matrix
#Vl <- if (gamma==1) NULL else chol2inv(chol(-object$lbb/gamma+object$St))
## need to pre-condition lbb before testing rank...
lbb <- object$D*t(object$D*lbb)
R <- suppressWarnings(chol(lbb,pivot=TRUE))
if (attr(R,"rank") < ncol(R)) {
## The hessian of the -ve log likelihood is not +ve definite
## Find the "nearest" +ve semi-definite version and use that
retry <- TRUE;tol <- 0
eh <- eigen(lbb,symmetric=TRUE)
mev <- max(eh$values);dtol <- 1e-7
while (retry) {
eh$values[eh$values<tol*mev] <- tol*mev
R <- sqrt(eh$values)*t(eh$vectors)
lbb <- crossprod(R)
Hp <- lbb + object$D*t(object$D*object$St) ## pre-conditioned Hp
## Now try to invert it by Choleski with diagonal pre-cond,
## to get Vb
object$L <- suppressWarnings(chol(Hp,pivot=TRUE))
if (attr(object$L,"rank")==ncol(Hp)) {
R <- t(t(R)/object$D) ## so R'R = lbb (original)
retry <- FALSE
} else { ## failure: make more +ve def
tol <- tol + dtol;dtol <- dtol*10
}
} ## retry
} else { ## hessian +ve def, so can simply use what comes from fit directly
ipiv <- piv <- attr(R,"pivot")
ipiv[piv] <- 1:p
R <- t(t(R[,ipiv])/object$D) ## so now t(R)%*%R = lbb (original)
}
## DL'LD = penalized Hessian, which needs to be inverted
## to DiLiLi'Di = Vb, the Bayesian cov matrix...
ipiv <- piv <- attr(object$L,"pivot")
ipiv[piv] <- 1:p
Vb <- crossprod(forwardsolve(t(object$L),diag(object$D,nrow=p)[piv,,drop=FALSE])[ipiv,,drop=FALSE])
## Insert any zeroes required as a result of dropping
## unidentifiable parameters...
if (sum(object$bdrop)) { ## some coefficients were dropped...
q <- length(object$bdrop)
ibd <- !object$bdrop
Vtemp <- Vb; Vb <- matrix(0,q,q)
Vb[ibd,ibd] <- Vtemp
Rtemp <- R; R <- matrix(0,q,q)
R[ibd,ibd] <- Rtemp
lbbt <- lbb;lbb <- matrix(0,q,q)
lbb[ibd,ibd] <- lbbt
}
edge.correct <- FALSE
## compute the smoothing parameter uncertainty correction...
if (!is.null(object$outer.info$hess)&&!is.null(object$db.drho)) {
hess <- object$outer.info$hess
edge.correct <- if (is.null(attr(hess,"edge.correct"))) FALSE else TRUE
K <- if (edge.correct) 2 else 1
for (k in 1:K) {
if (k==1) { ## fitted model computations
db.drho <- object$db.drho
dw.drho <- object$dw.drho
lsp <- log(object$sp)
} else { ## edge corrected model computations
db.drho <- attr(hess,"db.drho1")
dw.drho <- attr(hess,"dw.drho1")
lsp <- attr(hess,"lsp1")
hess <- attr(hess,"hess1")
}
if (!is.null(L)) db.drho <- db.drho%*%L ## transform to derivs w.r.t. working
ev <- eigen(hess,symmetric=TRUE)
d <- ev$values;ind <- d <= 0
d[ind] <- 0;d[!ind] <- 1/sqrt(d[!ind])
db.drho <- Sl.inirep(Sl,db.drho,1,0)
Vc <- crossprod((d*t(ev$vectors))%*%t(db.drho)) ## first correction
d <- ev$values; d[ind] <- 0;
d <- if (k==1) 1/sqrt(d+1/50) else 1/sqrt(d+1e-7)
Vr <- crossprod(d*t(ev$vectors))
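## Vr = U diag(1/(d+ridge)) U' is a ridge-regularized (pseudo) inverse of
## the REML Hessian w.r.t. the log smoothing parameters (negative
## eigenvalues floored at zero first), i.e. an approximate covariance
## matrix for log(sp), eventually returned as V.sp.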
if (k==1) {
Vc1 <- Vc; Vr1 <- Vr; lsp1 <- lsp ## un-shifted version to use for edf
}
## reverse the various re-parameterizations...
}
rp <- if (edge.correct) attr(object$outer.info$hess,"rp") else object$rp
#Vc <- Sl.repara(rp,Vc,inverse=TRUE) ## BUG: appears to be double correction - db already corrected??
#Vc <- Sl.initial.repara(Sl,Vc,inverse=TRUE)
V.sp <- Vr;attr(V.sp,"L") <- L
} else {
Vc <- 0; db.drho <- object$db.drho
V.sp <- NULL
}
Vb <- Sl.repara(object$rp,Vb,inverse=TRUE)
Vb <- Sl.initial.repara(Sl,Vb,inverse=TRUE)
Vc <- Vb + Vc
if (edge.correct) {
Vc1 <- Sl.repara(object$rp,Vc1,inverse=TRUE)
Vc1 <- Sl.initial.repara(Sl,Vc1,inverse=TRUE)
Vc1 <- Vb + Vc1
}
#R <- Sl.repara(object$rp,R,inverse=TRUE,both.sides=FALSE)
R <- Sl.repa(object$rp,R,r=1)
R <- Sl.initial.repara(Sl,R,inverse=TRUE,both.sides=FALSE,cov=FALSE)
F <- Vb%*%crossprod(R)
Ve <- F%*%Vb ## 'frequentist' cov matrix
edf <- diag(F)
## note that edf1 is a heuristic upper bound on the EDF - it is the
## df of the best unpenalized approximation to the 1st order bias corrected
## model. It is expected to exceed edf2, because of bias correction
## variability, but is bounded in a way that is not *guaranteed* for edf2.
## Note that the justification applies only to sum(edf1/2), not elementwise.
if (!is.null(object$outer.info$hess)&&!is.null(object$db.drho)) {
## second correction term is easier computed in original parameterization...
Vc <- Vc + Vb.corr(R,L,lsp0,S,off,dw=NULL,w=NULL,lsp,Vr)
if (edge.correct) Vc1 <- Vc1 +
Vb.corr(R,L,lsp0,S,off,dw=NULL,w=NULL,lsp1,Vr1) else Vc1 <- Vc
}
edf1 <- 2*edf - rowSums(t(F)*F)
edf2 <- if (edge.correct) rowSums(Vc1*crossprod(R)) else rowSums(Vc*crossprod(R))
if (sum(edf2)>sum(edf1)) edf2 <- edf1
## note: the hat matrix is not available here...
list(Vc=Vc,Vp=Vb,Ve=Ve,V.sp=V.sp,edf=edf,edf1=edf1,edf2=edf2,#F=F,
R=R,db.drho=db.drho)
} ## gam.fit5.post.proc
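## The eigen-truncation repair used above, in isolation: a minimal sketch,
## with 'near.pd' a hypothetical name, not part of the package. Eigenvalues
## below tol*max(eigenvalue) are raised to that floor (assuming the largest
## eigenvalue is positive), giving a simple "nearest" +ve semi-definite
## substitute for a symmetric matrix A...
near.pd <- function(A,tol=1e-7) {
eh <- eigen((A+t(A))/2,symmetric=TRUE) ## symmetrize defensively
ev <- pmax(eh$values,tol*max(eh$values)) ## floor the eigenvalues
eh$vectors %*% (ev*t(eh$vectors)) ## U diag(ev) U'
} ## near.pd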
deriv.check5 <- function(x, y, sp,
weights = rep(1, length(y)), start = NULL,
offset = rep(0, length(y)),Mp,family = gaussian(),
control = gam.control(),deriv=2,eps=1e-7,spe=1e-3,
Sl,gamma=1,nei=NULL,...)
## FD checking of derivatives for gam.fit5: a debugging routine
{ if (!deriv%in%c(1,2)) stop("deriv should be 1 or 2")
if (control$epsilon>1e-9) control$epsilon <- 1e-9
## first obtain the fit corresponding to sp...
b <- gam.fit5(x=x,y=y,lsp=sp,Sl=Sl,weights=weights,offset=offset,deriv=deriv,
family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei)
## now get the derivatives of the likelihood w.r.t. coefs...
ll <- family$ll(y=y,X=x,coef=b$coefficients,wt=weights,family=family,
deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL)
## and finite difference versions of these...
p <- length(b$coefficients)
fdg <- rep(0,p)
fdh <- matrix(0,p,p)
for (i in 1:p) {
coef1 <- b$coefficients;coef1[i] <- coef1[i] + eps
ll1 <- family$ll(y=y,X=x,coef=coef1,wt=weights,family=family,
deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL)
fdg[i] <- (ll1$l - ll$l)/eps
fdh[,i] <- (ll1$lb - ll$lb)/eps
}
## display them...
oask <- devAskNewPage(TRUE)
on.exit(devAskNewPage(oask))
plot(ll$lb,fdg,xlab="computed",ylab="FD",main="grad of log lik");abline(0,1)
cat("log lik grad cor. =",cor(ll$lb,fdg),"\n")
plot(ll$lbb,fdh,xlab="computed",ylab="FD",main="hess of log lik");abline(0,1)
cat("log lik hess cor. =",cor(as.numeric(ll$lbb),as.numeric(fdh)),"\n")
## now we need to investigate the derivatives w.r.t. the log smoothing parameters.
M <- length(sp) ## number of smoothing parameters
fd.br <- matrix(0,p,M)
REML1 <- rep(0,M)
fd.dH <- list()
if (!is.null(b$b2)) fd.br2 <- b$b2*0
k <- 0
for (i in 1:M) { ## the smoothing parameter loop
sp0 <- sp1 <- sp;sp1[i] <- sp[i] + spe/2;sp0[i] <- sp[i] - spe/2
b0 <- gam.fit5(x=x,y=y,lsp=sp0,Sl=Sl,weights=weights,offset=offset,deriv=1,
family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei)
b1 <- gam.fit5(x=x,y=y,lsp=sp1,Sl=Sl,weights=weights,offset=offset,deriv=1,
family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei)
fd.br[,i] <- (b1$coefficients - b0$coefficients)/spe
if (!is.null(b$b2)) {
for (j in i:M) {
k <- k + 1
fd.br2[,k] <- (b1$db.drho[,j]-b0$db.drho[,j])/spe
}
}
REML1[i] <- (b1$REML-b0$REML)/spe
fd.dH[[i]] <- (b1$lbb - b0$lbb)/spe
}
## plot db.drho against fd versions...
for (i in 1:M) {
plot(b$db.drho[,i],fd.br[,i],xlab="computed",ylab="FD",main="db/drho");abline(0,1)
cat("cor db/drho[",i,"] = ",cor(b$db.drho[,i],fd.br[,i]),"\n")
}
## second deriv of b
if (!is.null(b$b2)) for (i in 1:ncol(b$b2)) {
plot(b$b2[,i],fd.br2[,i],xlab="computed",ylab="FD",main="d2b/drhorho");abline(0,1)
cat("cor d2b[",i,"] = ",cor(b$b2[,i],fd.br2[,i]),"\n")
}
## plot first deriv Hessian against FD version
for (i in 1:M) {
plot(b$dH[[i]],fd.dH[[i]],xlab="computed",ylab="FD",main="dH/drho");abline(0,1)
cat("cor dH/drho[",i,"] = ",cor(as.numeric(b$dH[[i]]),as.numeric(fd.dH[[i]])),"\n")
}
list(fd=list(lb=fdg,lbb=fdh,REML1=REML1,db.drho=fd.br,dH=fd.dH),
lb=ll$lb,lbb=ll$lbb,REML1=b$REML1,db.drho=b$db.drho,dH=b$dH)
} ## deriv.check5
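## The finite differencing idea above, reduced to its essentials. A
## standalone sketch - 'fd.grad' is a hypothetical helper, not part of the
## package - approximating the gradient of f by forward differences, for
## comparison with an analytic gradient as in the plots above...
fd.grad <- function(f,x,eps=1e-7) {
p <- length(x); fd <- rep(0,p); f0 <- f(x)
for (i in 1:p) {
x1 <- x; x1[i] <- x1[i] + eps
fd[i] <- (f(x1)-f0)/eps
}
fd
} ## fd.grad
## e.g. f <- function(x) sum(x^2) has gradient 2*x, so
## range(fd.grad(f,1:3) - 2*(1:3)) ## ~ eps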