R/gam.fit4.r
In mgcv: Mixed GAM Computation Vehicle with Automatic Smoothness Estimation

Documented in dDeta gam.fit5.post.proc

# (c) Simon N. Wood (2013-2022). Provided under GPL 2.
## Routines for gam estimation beyond exponential family.

dDeta <- function(y,mu,wt,theta,fam,deriv=0) {
## What is available directly from the family are derivatives of the 
## deviance and link w.r.t. mu. This routine converts these to the
## required derivatives of the deviance w.r.t. eta.
## deriv is the order of derivative of the smoothing parameter score 
## required.
## This version is based on ratios of derivatives of links rather 
## than raw derivatives of links. g2g = g''/g'^2, g3g = g'''/g'^3 etc 
   r <- fam$Dd(y, mu, theta, wt, level=deriv)  
   d <- list(Deta=0,Dth=0,Dth2=0,Deta2=0,EDeta2=0,Detath=0,
             Deta3=0,Deta2th=0,Detath2=0,
             Deta4=0,Deta3th=0,Deta2th2=0)
   if (fam$link=="identity") { ## don't waste time on transformation
      d$Deta <- r$Dmu;d$Deta2 <- r$Dmu2
      d$EDeta2 <- r$EDmu2;d$Deta.Deta2 <- r$Dmu/r$Dmu2
      d$Deta.EDeta2 <- r$Dmu/r$EDmu2
      if (deriv>0) {
        d$Dth <- r$Dth; d$Detath <- r$Dmuth
        d$Deta3 <- r$Dmu3; d$Deta2th <- r$Dmu2th
	d$EDeta2th <- r$EDmu2th;d$EDeta3 <- r$EDmu3
      }
      if (deriv>1) {
        d$Deta4 <- r$Dmu4; d$Dth2 <- r$Dth2; d$Detath2 <- r$Dmuth2
        d$Deta2th2 <- r$Dmu2th2; d$Deta3th <- r$Dmu3th
      }
   } else {

     ig1 <- fam$mu.eta(fam$linkfun(mu)) 
     ig12 <- ig1^2
    
     g2g <- fam$g2g(mu)

     d$Deta <- r$Dmu * ig1
     d$Deta2 <- r$Dmu2*ig12 - r$Dmu*g2g*ig1
     d$EDeta2 <- r$EDmu2*ig12
     d$Deta.Deta2 <- r$Dmu/(r$Dmu2*ig1 - r$Dmu*g2g)
     d$Deta.EDeta2 <- r$Dmu/(r$EDmu2*ig1)
     if (deriv>0) {
        ig13 <- ig12 * ig1
        d$Dth <- r$Dth 
        d$Detath <- r$Dmuth * ig1
        g3g <- fam$g3g(mu)
        d$Deta3 <- r$Dmu3*ig13 - 3*r$Dmu2 * g2g * ig12 + r$Dmu * (3*g2g^2 - g3g)*ig1
        if (!is.null(r$EDmu3)) d$EDeta3 <- r$EDmu3*ig13 - 3*r$EDmu2 * g2g * ig12 ## EDmu=0
        d$Deta2th <- r$Dmu2th*ig12 - r$Dmuth*g2g*ig1
        if (!is.null(r$EDmu2th)) d$EDeta2th <- r$EDmu2th*ig12 ##- r$EDmuth*g2g*ig1
     }
     if (deriv>1) {
       g4g <- fam$g4g(mu)
       d$Deta4 <- ig12^2*r$Dmu4 - 6*r$Dmu3*ig13*g2g + r$Dmu2*(15*g2g^2-4*g3g)*ig12 - 
                       r$Dmu*(15*g2g^3-10*g2g*g3g  +g4g)*ig1
       d$Dth2 <- r$Dth2
       d$Detath2 <- r$Dmuth2 * ig1 
       d$Deta2th2 <- ig12*r$Dmu2th2 - r$Dmuth2*g2g*ig1
       d$Deta3th <-  ig13*r$Dmu3th - 3 *r$Dmu2th*g2g*ig12 + r$Dmuth*(3*g2g^2-g3g)*ig1
     }
   } ## end of non identity
   good <- is.finite(d$Deta)&is.finite(d$Deta2)
   if (deriv>0) {
     nth <- length(theta)
     if (nth>1) good <- good&is.finite(rowSums(d$Dth))&is.finite(rowSums(d$Detath))&
                                 is.finite(rowSums(d$Deta2th))&is.finite(d$Deta3) else
     good <- good & is.finite(d$Deta3) & if (nth==0) TRUE else
             is.finite(d$Dth)&is.finite(d$Detath)&is.finite(d$Deta2th)
     if (deriv>1) { 
       if (nth>1) good <- good&is.finite(rowSums(d$Dth2))&is.finite(rowSums(d$Detath2))&is.finite(rowSums(d$Deta2th2))&
                          is.finite(rowSums(d$Deta3th))&is.finite(d$Deta4) else
       good <- good & is.finite(d$Deta4) & if (nth==0) TRUE else
               is.finite(d$Dth2)&is.finite(d$Detath2)&is.finite(d$Deta2th2)&is.finite(d$Deta3th)
     }
   }	   
   d$good <- good	   
   d
} ## dDeta

fetad.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family derivatives w.r.t. eta
  
  dd <- dDeta(y,mu,wt,theta,fam,deriv=2)
  dev <- fam$dev.resids(y, mu, wt,theta)
  mu1 <- fam$linkinv(fam$linkfun(mu)+eps)
  dev1 <- fam$dev.resids(y,mu1, wt,theta)
  Deta.fd <- (dev1-dev)/eps
  cat("Deta: rdiff = ",range(dd$Deta-Deta.fd)," cor = ",cor(dd$Deta,Deta.fd),"\n")
  plot(dd$Deta,Deta.fd);abline(0,1)
  nt <- length(theta)
  if (fam$n.theta && nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dev1 <- fam$dev.resids(y, mu, wt,th1)   
    Dth.fd <- (dev1-dev)/eps
    um <- if (nt>1) dd$Dth[,i] else dd$Dth
    cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",cor(um,Dth.fd),"\n")
    plot(um,Dth.fd);abline(0,1)
  }
  ## second order up...
  dd1 <- dDeta(y,mu1,wt,theta,fam,deriv=2)
  Deta2.fd <- (dd1$Deta - dd$Deta)/eps
  cat("Deta2: rdiff = ",range(dd$Deta2-Deta2.fd)," cor = ",cor(dd$Deta2,Deta2.fd),"\n")
  plot(dd$Deta2,Deta2.fd);abline(0,1)
  Deta3.fd <- (dd1$Deta2 - dd$Deta2)/eps
  cat("Deta3: rdiff = ",range(dd$Deta3-Deta3.fd)," cor = ",cor(dd$Deta3,Deta3.fd),"\n")
  plot(dd$Deta3,Deta3.fd);abline(0,1)
  Deta4.fd <- (dd1$Deta3 - dd$Deta3)/eps
  cat("Deta4: rdiff = ",range(dd$Deta4-Deta4.fd)," cor = ",cor(dd$Deta4,Deta4.fd),"\n")
  plot(dd$Deta4,Deta4.fd);abline(0,1)
  ## and now the higher derivs wrt theta...
  ind <- 1:nt
  if (fam$n.theta && nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dd1 <- dDeta(y,mu,wt,th1,fam,deriv=2)
    Detath.fd <- (dd1$Deta - dd$Deta)/eps 
    um <- if (nt>1) dd$Detath[,i] else dd$Detath
    cat("Detath[",i,"]: rdiff = ",range(um-Detath.fd)," cor = ",cor(um,Detath.fd),"\n")
    plot(um,Detath.fd);abline(0,1)
    Deta2th.fd <- (dd1$Deta2 - dd$Deta2)/eps
    um <- if (nt>1) dd$Deta2th[,i] else dd$Deta2th
    cat("Deta2th[",i,"]: rdiff = ",range(um-Deta2th.fd)," cor = ",cor(um,Deta2th.fd),"\n") 
    plot(um,Deta2th.fd);abline(0,1)
    Deta3th.fd <- (dd1$Deta3 - dd$Deta3)/eps
    um <- if (nt>1) dd$Deta3th[,i] else dd$Deta3th
    cat("Deta3th[",i,"]: rdiff = ",range(um-Deta3th.fd)," cor = ",cor(um,Deta3th.fd),"\n")
    plot(um,Deta3th.fd);abline(0,1)
    ## now the 3 second derivative w.r.t. theta terms

    Dth2.fd <- (dd1$Dth - dd$Dth)/eps
    um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
    er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
    cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    plot(um,er);abline(0,1)
    Detath2.fd <- (dd1$Detath - dd$Detath)/eps
    um <- if (nt>1) dd$Detath2[,ind] else dd$Detath2
    er <- if (nt>1) Detath2.fd[,i:nt] else Detath2.fd
    cat("Detath2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    ## cat("Detath2[",i,",]: rdiff = ",range(dd$Detath2-Detath2.fd)," cor = ",cor(dd$Detath2,Detath2.fd),"\n")
    plot(um,er);abline(0,1)
 
    Deta2th2.fd <- (dd1$Deta2th - dd$Deta2th)/eps
    um <- if (nt>1) dd$Deta2th2[,ind] else dd$Deta2th2
    er <- if (nt>1) Deta2th2.fd[,i:nt] else Deta2th2.fd
    cat("Deta2th2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    ## cat("Deta2th2[",i,",]: rdiff = ",range(dd$Deta2th2-Deta2th2.fd)," cor = ",cor(dd$Deta2th2,Deta2th2.fd),"\n") 
    ind <- max(ind)+1:(nt-i) 
    plot(um,er);abline(0,1)
  }
} ## fetad.test

corb <- function(x,z) {
## alternative to cor for measuring similarity of x and z,
## which is not scaling invariant. So 1 really means x and z
## are very close, not just linearly related.
  d <- x-z
  1-mean(d^2)/(sd(x)*sd(z))
}

fmud.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family deviance derivatives w.r.t. mu
  ## copy to make debugging easier...
  Dd <- fam$Dd;dev.resids <- fam$dev.resids 
  dd <- Dd(y, mu, theta, wt, level=2) 
  dev <- dev.resids(y, mu, wt,theta)
  dev1 <- dev.resids(y, mu+eps, wt,theta)
  Dmu.fd <- (dev1-dev)/eps
  cat("Dmu: rdiff = ",range(dd$Dmu-Dmu.fd)," cor = ",corb(dd$Dmu,Dmu.fd),"\n")
  if (plot) {
    pch <- 19;cex <- .4
    plot(dd$Dmu,Dmu.fd,pch=pch,cex=cex);abline(0,1,col=2)
    oask <- devAskNewPage(TRUE)
    on.exit(devAskNewPage(oask))
  }
  nt <- length(theta)
  if (fam$n.theta>0&&nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dev1 <- dev.resids(y, mu, wt,th1)
    Dth.fd <- (dev1-dev)/eps
    um <- if (nt>1) dd$Dth[,i] else dd$Dth
    cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",corb(um,Dth.fd),"\n",sep="")
    if (plot) { plot(um,Dth.fd,pch=pch,cex=cex,xlab=paste("Dth[",i,"]",sep=""));
    abline(0,1,col=2)}
  }
  ## second order up...
  dd1 <- Dd(y, mu+eps, theta, wt, level=2)
  Dmu2.fd <- (dd1$Dmu - dd$Dmu)/eps
  cat("Dmu2: rdiff = ",range(dd$Dmu2-Dmu2.fd)," cor = ",corb(dd$Dmu2,Dmu2.fd),"\n")
  if (plot) { plot(dd$Dmu2,Dmu2.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  Dmu3.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
  cat("Dmu3: rdiff = ",range(dd$Dmu3-Dmu3.fd)," cor = ",corb(dd$Dmu3,Dmu3.fd),"\n")
  if (plot) { plot(dd$Dmu3,Dmu3.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  Dmu4.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
  cat("Dmu4: rdiff = ",range(dd$Dmu4-Dmu4.fd)," cor = ",corb(dd$Dmu4,Dmu4.fd),"\n")
  if (plot) { plot(dd$Dmu4,Dmu4.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  ## and now the higher derivs wrt theta 
  ind <- 1:nt
  if (fam$n.theta>0&&nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dd1 <- Dd(y, mu, th1, wt, level=2)
    Dmuth.fd <- (dd1$Dmu - dd$Dmu)/eps
    um <- if (nt>1) dd$Dmuth[,i] else dd$Dmuth
    cat("Dmuth[",i,"]: rdiff = ",range(um-Dmuth.fd)," cor = ",corb(um,Dmuth.fd),"\n")
    if (plot) { plot(um,Dmuth.fd,pch=pch,cex=cex,xlab=paste("Dmuth[",i,"]",sep=""));abline(0,1,col=2)}
    Dmu2th.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
    um <- if (nt>1) dd$Dmu2th[,i] else dd$Dmu2th
    cat("Dmu2th[",i,"]: rdiff = ",range(um-Dmu2th.fd)," cor = ",corb(um,Dmu2th.fd),"\n")
    if (plot) { plot(um,Dmu2th.fd,pch=pch,cex=cex,xlab=paste("Dmu2th[",i,"]",sep=""));abline(0,1,col=2)}
    if (!is.null(dd$EDmu2th)) {
       EDmu2th.fd <- (dd1$EDmu2 - dd$EDmu2)/eps
       um <- if (nt>1) dd$EDmu2th[,i] else dd$EDmu2th
       cat("EDmu2th[",i,"]: rdiff = ",range(um-EDmu2th.fd)," cor = ",corb(um,EDmu2th.fd),"\n")
       if (plot) { plot(um,EDmu2th.fd,pch=pch,cex=cex,xlab=paste("EDmu2th[",i,"]",sep=""));abline(0,1,col=2)}
    }
    Dmu3th.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
    um <- if (nt>1) dd$Dmu3th[,i] else dd$Dmu3th
    cat("Dmu3th[",i,"]: rdiff = ",range(um-Dmu3th.fd)," cor = ",corb(um,Dmu3th.fd),"\n")
    if (plot) { plot(um,Dmu3th.fd,pch=pch,cex=cex,xlab=paste("Dmu3th[",i,"]",sep=""));abline(0,1,col=2)}
    ## now the 3 second derivative w.r.t. theta terms...

    Dth2.fd <- (dd1$Dth - dd$Dth)/eps
    um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
    er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
    cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    Dmuth2.fd <- (dd1$Dmuth - dd$Dmuth)/eps
    um <- if (nt>1) dd$Dmuth2[,ind] else dd$Dmuth2
    er <- if (nt>1) Dmuth2.fd[,i:nt] else Dmuth2.fd
    cat("Dmuth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmuth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    Dmu2th2.fd <- (dd1$Dmu2th - dd$Dmu2th)/eps
    um <- if (nt>1) dd$Dmu2th2[,ind] else dd$Dmu2th2
    er <- if (nt>1) Dmu2th2.fd[,i:nt] else Dmu2th2.fd
    cat("Dmu2th2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmu2th2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    ind <- max(ind)+1:(nt-i)
  }
} ## fmud.test



gam.fit4 <- function(x, y, sp, Eb,UrS=list(),
            weights = rep(1, nobs), start = NULL, etastart = NULL, 
            mustart = NULL, offset = rep(0, nobs),U1=diag(ncol(x)), Mp=-1, family = gaussian(), 
            control = gam.control(), deriv=2,gamma=1,
            scale=1,scoreType="REML",null.coef=rep(0,ncol(x)),nei=NULL,...) {
## Routine for fitting GAMs beyond exponential family.
## Inputs as gam.fit3 except that family is of class "extended.family", while
## sp contains the vector of extended family parameters, followed by the log smoothing parameters,
## followed by the log scale parameter if scale < 0

  ## some families have second derivative of deviance, and hence iterative weights
  ## very close to zero for some data. This can lead to poorly scaled sqrt(w)z
  ## and it is better to base everything on wz...
  if (is.null(family$use.wz)) family$use.wz <- FALSE
  warn <- list()
  if (family$n.theta>0) { ## there are extra parameters to estimate
    ind <- 1:family$n.theta
    theta <- sp[ind] ## parameters of the family
    family$putTheta(theta)
    sp <- sp[-ind]   ## log smoothing parameters
  } else theta <- family$getTheta() ## fixed value

  ## penalized <- if (length(UrS)>0) TRUE else FALSE

  if (scale>0) scale.known <- TRUE else {
    ## unknown scale parameter, trial value supplied as 
    ## final element of sp. 
    scale.known <- FALSE
    nsp <- length(sp)
    scale <- exp(sp[nsp])
    sp <- sp[-nsp]
  }
  
  x <- as.matrix(x)  
  nSp <- length(sp) 
  rank.tol <- .Machine$double.eps*100 ## tolerance to use for rank deficiency
  q <- ncol(x)
  n <- nobs <- nrow(x)  
  
  xnames <- dimnames(x)[[2]]
  ynames <- if (is.matrix(y)) rownames(y) else names(y)
  ## Now a stable re-parameterization is needed....

  if (length(UrS)) {
      grderiv <- if (scoreType=="EFS") 1 else deriv 
      rp <- gam.reparam(UrS,sp,grderiv)
      T <- diag(q)
      T[1:ncol(rp$Qs),1:ncol(rp$Qs)] <- rp$Qs
      T <- U1%*%T ## new params b'=T'b old params
    
      null.coef <- t(T)%*%null.coef  
     
      if (!is.null(start)) start <- t(T)%*%start

      ## form x%*%T in parallel 
      x <- .Call(C_mgcv_pmmult2,x,T,0,0,control$nthreads)
      rS <- list()
      for (i in 1:length(UrS)) {
        rS[[i]] <- rbind(rp$rS[[i]],matrix(0,Mp,ncol(rp$rS[[i]])))
      } ## square roots of penalty matrices in current parameterization
      Eb <- Eb%*%T ## balanced penalty matrix
      rows.E <- q-Mp
      Sr <- cbind(rp$E,matrix(0,nrow(rp$E),Mp))
      St <- rbind(cbind(rp$S,matrix(0,nrow(rp$S),Mp)),matrix(0,Mp,q))
  } else {
      grderiv <- 0
      T <- diag(q); 
      St <- matrix(0,q,q) 
      rSncol <- rows.E <- Eb <- Sr <- 0   
      rS <- list(0)
      rp <- list(det=0,det1 = 0,det2 = 0,fixed.penalty=FALSE)
  }

  ## re-parameterization complete. Initialization....

  nvars <- ncol(x)
  if (nvars==0) stop("emtpy models not available")
  if (is.null(weights)) weights <- rep.int(1, nobs)
  if (is.null(offset)) offset <- rep.int(0, nobs)

  linkinv <- family$linkinv
  valideta <- family$valideta
  validmu <- family$validmu
  dev.resids <- family$dev.resids

  ## need an initial `null deviance' to test for initial divergence...
  ## if (!is.null(start)) null.coef <- start - can be on edge of feasible - not good
  null.eta <- as.numeric(x%*%null.coef + as.numeric(offset))

  ## call the families initialization code...

  if (is.null(mustart)) {
    eval(family$initialize)
    mukeep <- NULL
  } else {
    mukeep <- mustart
    eval(family$initialize)
    #mustart <- mukeep
  }

  old.pdev <- sum(dev.resids(y, linkinv(null.eta), weights,theta)) + t(null.coef)%*%St%*%null.coef 

  if (!is.null(start)) { ## check it's at least better than null.coef
    pdev <- sum(dev.resids(y, linkinv(x%*%start+as.numeric(offset)), weights,theta)) + t(start)%*%St%*%start
    if (pdev>old.pdev) start <- mukeep <- etastart <- NULL
  }
  coefold <- null.coef ## set to default, may be replaced below
  etaold <- x %*% coefold + offset
  if (!is.null(mukeep)) mustart <- mukeep

  ## and now finalize initialization of mu and eta...

  eta <- if (!is.null(etastart)) etastart
         else if (!is.null(start)) 
              if (length(start) != nvars) 
                  stop("Length of start should equal ", nvars, 
                  " and correspond to initial coefs for ", deparse(xnames))
              else {
                  coefold <- start
                  etaold <- offset + as.vector(if (NCOL(x) == 1) 
                  x * start
                  else x %*% start)
              }
              else family$linkfun(mustart)
  
   mu <- linkinv(eta)
   conv <-  boundary <- FALSE
   dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
   w <- dd$Deta2 * .5
   wz <- w*(eta-offset) - .5*dd$Deta
   z <- (eta-offset) - dd$Deta.Deta2
   good <- is.finite(z)&is.finite(w)
   zg <- rep(0,max(dim(x)))

   for (iter in 1:control$maxit) { ## start of main fitting iteration 
      if (control$trace) cat(iter," ")
      if (control$trace&sum(!good)>0) cat("\n",sum(!good)," not good\n")
      if (sum(!good)) {
        use.wy <- TRUE
        good <- is.finite(w)&is.finite(wz)
        z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
      } else use.wy <- family$use.wz
      if (sum(good)==0) stop("no good data in iteration")
      ng <- sum(good)
      zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
      oo <- .C(C_pls_fit1,   
               y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
                     E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
                     q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
                     penalty=as.double(1),rank.tol=as.double(rank.tol),
                     nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))		     
      posdef <- oo$n >= 0
      if (!posdef) { ## then problem is indefinite - switch to +ve weights for this step
        if (control$trace) cat("**using positive weights\n")
        # problem is that Fisher can be very poor for zeroes  

        ## index weights that are finite and positive 
        good <- is.finite(dd$Deta2)
        good[good] <- dd$Deta2[good]>0 
        w[!good] <- 0
        wz <- w*(eta-offset) - .5*dd$Deta
        z <- (eta-offset) - dd$Deta.Deta2
        good <- is.finite(z)&is.finite(w) 
        if (sum(!good)) {
          use.wy <- TRUE
          good <- is.finite(w)&is.finite(wz)
          z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
        } else use.wy <- family$use.wz
        ng <- sum(good)
        zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs     
        oo <- .C(C_pls_fit1, ##C_pls_fit1,
                  y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
                     E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
                     q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
                     penalty=as.double(1),rank.tol=as.double(rank.tol),
                     nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
      }
      start <- oo$y[1:ncol(x)] ## current coefficient estimates
      penalty <- oo$penalty ## size of penalty

      eta <- drop(x%*%start) ## the linear predictor (less offset)

      if (any(!is.finite(start))) { ## test for breakdown
          conv <- FALSE
          warn[[length(warn)+1]] <- paste(" gam.fit4 non-finite coefficients at iteration ", 
                  iter)
          return(list(REML=NA,warn=warn)) ## return immediately signalling failure
      }        
     
      mu <- linkinv(eta <- eta + offset)
      dev <- sum(dev.resids(y, mu, weights,theta)) 

      ## now step halve under non-finite deviance...
      if (!is.finite(dev)) {
         if (is.null(coefold)) {
            if (is.null(null.coef)) 
              stop("no valid set of coefficients has been found:please supply starting values", 
                    call. = FALSE)
            ## Try to find feasible coefficients from the null.coef and null.eta
            coefold <- null.coef
            etaold <- null.eta
         }
        
         ii <- 1
         while (!is.finite(dev)) {
               if (ii > control$maxit) 
                    stop("inner loop 1; can't correct step size")
               ii <- ii + 1
               start <- (start + coefold)/2
               eta <- (eta + etaold)/2               
               mu <- linkinv(eta)
               dev <- sum(dev.resids(y, mu, weights,theta))
              
         }
         boundary <- TRUE
         penalty <- t(start)%*%St%*%start ## reset penalty too
         if (control$trace) 
                  cat("Step halved: new deviance =", dev, "\n")
      } ## end of infinite deviance correction

      ## now step halve if mu or eta are out of bounds... 
      if (!(valideta(eta) && validmu(mu))) {
       
         ii <- 1
         while (!(valideta(eta) && validmu(mu))) {
                  if (ii > control$maxit) 
                    stop("inner loop 2; can't correct step size")
                  ii <- ii + 1
                  start <- (start + coefold)/2
                  eta <- (eta + etaold)/2 
                  mu <- linkinv(eta)
         }
         boundary <- TRUE
         dev <- sum(dev.resids(y, mu, weights,theta))
         penalty <- t(start)%*%St%*%start ## need to reset penalty too
         if (control$trace) 
                  cat("Step halved: new deviance =", dev, "\n")
      } ## end of invalid mu/eta handling

      ## now check for divergence of penalized deviance....
  
      pdev <- dev + penalty  ## the penalized deviance 
      if (control$trace) cat("penalized deviance =", pdev, "\n")
     
      div.thresh <- 10*(.1+abs(old.pdev))*.Machine$double.eps^.5

      if (pdev-old.pdev>div.thresh) { ## solution diverging
         ii <- 1 ## step halving counter
         if (iter==1) { ## immediate divergence, need to shrink towards zero 
               etaold <- null.eta; coefold <- null.coef
         }
         while (pdev -old.pdev > div.thresh)  { ## step halve until pdev <= old.pdev
           if (ii > 100) 
              stop("inner loop 3; can't correct step size")
           ii <- ii + 1
           start <- (start + coefold)/2 
           eta <- (eta + etaold)/2               
           mu <- linkinv(eta)
           dev <- sum(dev.resids(y, mu, weights,theta))

           penalty <-  t(start)%*%St%*%start
           pdev <- dev + penalty ## the penalized deviance
           if (control$trace) 
                  cat("Step halved: new penalized deviance =", pdev, "\n")
        }
     } ## end of pdev divergence

     if (scoreType=="EFS"&&family$n.theta>0) { ## there are theta parameters to estimate...
       scale1 <- if (!is.null(family$scale)) family$scale else scale
       if (family$n.theta>0||scale<0) theta <- estimate.theta(theta,family,y,mu,scale=scale1,wt=weights,tol=1e-7)
       if (!is.null(family$scale) && family$scale<0) {
	  scale <- exp(theta[family$n.theta+1])
	  theta <- theta[1:family$n.theta]
       }  
       family$putTheta(theta)
     }
     ## get new weights and pseudodata (needed now for grad testing)...
     dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
     w <- dd$Deta2 * .5;
     wz <- w*(eta-offset) - .5*dd$Deta
     z <- (eta-offset) - dd$Deta.Deta2
     good <- is.finite(z)&is.finite(w) 
     ## convergence testing...
     if (posdef && abs(pdev - old.pdev)/(0.1 + abs(pdev)) < control$epsilon) {
       ## Need to check coefs converged adequately, to ensure implicit differentiation
       ## ok. Testing coefs unchanged is problematic under rank deficiency (not guaranteed to
       ## drop same parameter every iteration!)
       grad <- 2 * t(x[good,,drop=FALSE])%*%((w[good]*(x%*%start)[good]-wz[good]))+ 2*St%*%start 
       if (max(abs(grad)) > control$epsilon*(abs(pdev)+scale)) {
         old.pdev <- pdev  ## not converged quite enough
         coef <- coefold <- start
         etaold <- eta 
         ##muold <- mu
       } else { ## converged
         conv <- TRUE
         coef <- start
         break 
       }
     } else { ## not converged
       old.pdev <- pdev
       coef <- coefold <- start
       etaold <- eta 
     }
     if (scoreType=="EFS"&&family$n.theta>0) { 
       ## now recompute pdev with new theta, otherwise step control won't work at next iteration
       dev <- sum(dev.resids(y, mu, weights,theta))
       old.pdev <- pdev <- dev + penalty
     }
   } ## end of main loop
   
   ## so at this stage the model has been fully estimated
   coef <- as.numeric(T %*% coef)
 
   ## now obtain derivatives, if these are needed...
   check.derivs <- FALSE
   while (check.derivs) { ## debugging code to check derivatives
     eps <- 1e-7
     fmud.test(y,mu,weights,theta,family,eps = eps)
     fetad.test(y,mu,weights,theta,family,eps = eps)
   }   

   dd <- dDeta(y,mu,weights,theta,family,deriv)
   w <- dd$Deta2 * .5
   z <- (eta-offset) - dd$Deta.Deta2 ## - .5 * dd$Deta[good] / w
   wf <- pmax(0,dd$EDeta2 * .5) ## Fisher type weights 
   wz <- w*(eta-offset) - 0.5*dd$Deta ## Wz finite when w==0
  
   good <- is.finite(wz)&is.finite(w)&dd$good
   if (sum(good)==0) stop("not enough finite derivatives")
      
   residuals <- z - (eta - offset)
   residuals[!is.finite(residuals)] <- NA 
   z[!is.finite(z)] <- 0 ## avoid passing NA etc to C code  

   ntot <- length(theta) + length(sp)
   rSncol <- unlist(lapply(UrS,ncol))
   ## Now drop any elements of dd that have been dropped in fitting...
   if (sum(!good)>0) { ## drop !good from fields of dd, weights and pseudodata
     z <- z[good]; w <- w[good]; wz <- wz[good]; wf <- wf[good]
     dd$Deta <- dd$Deta[good];dd$Deta2 <- dd$Deta2[good] 
     dd$EDeta2 <- dd$EDeta2[good]
     if (deriv>0) dd$Deta3 <- dd$Deta3[good]
     if (deriv>1) dd$Deta4 <- dd$Deta4[good]
     if (length(theta)>1) {
         if (deriv>0) {  
         dd$Dth <- dd$Dth[good,]; 
         dd$Detath <- dd$Detath[good,]; dd$Deta2th <- dd$Deta2th[good,]
         if (deriv>1) {  
           dd$Detath2 <- dd$Detath2[good,]; dd$Deta3th <- dd$Deta3th[good,]
           dd$Deta2th2 <- dd$Deta2th2[good,];dd$Dth2 <- dd$Dth2[good,]
         }
       }
     } else {
       if (deriv>0) { 
         dd$Dth <- dd$Dth[good]; 
         dd$Detath <- dd$Detath[good]; dd$Deta2th <- dd$Deta2th[good]
         if (deriv>1) {
           dd$Detath2 <- dd$Detath2[good]; dd$Deta3th <- dd$Deta3th[good]
           dd$Deta2th2 <- dd$Deta2th2[good]; dd$Dth2 <- dd$Dth2[good]
         }
       } 
     }
   }

   ## gdi.type should probably be computed after dropping via good
   
   gdi.type <- if (any(abs(w)<.Machine$double.xmin*1e20)||any(!is.finite(z))) 1 else 0   

   if (scoreType=="EFS") scoreType <- "REML"
   oo <- .C(C_gdi2,
            X=as.double(x[good,]),E=as.double(Sr),Es=as.double(Eb),rS=as.double(unlist(rS)),
            U1 = as.double(U1),sp=as.double(exp(sp)),theta=as.double(theta),
            z=as.double(z),w=as.double(w),wz=as.double(wz),wf=as.double(wf),Dth=as.double(dd$Dth),
            Det=as.double(dd$Deta),
            Det2=as.double(dd$Deta2),Dth2=as.double(dd$Dth2),Det.th=as.double(dd$Detath),
            Det2.th=as.double(dd$Deta2th),Det3=as.double(dd$Deta3),Det.th2 = as.double(dd$Detath2),
            Det4 = as.double(dd$Deta4),Det3.th=as.double(dd$Deta3th), Deta2.th2=as.double(dd$Deta2th2),
            beta=as.double(coef),b1=as.double(rep(0,ntot*ncol(x))),w1=as.double(rep(0,ntot*length(z))),
            D1=as.double(rep(0,ntot)),D2=as.double(rep(0,ntot^2)),
            P=as.double(0),P1=as.double(rep(0,ntot)),P2 = as.double(rep(0,ntot^2)),
            ldet=as.double(1-2*(scoreType=="ML")),ldet1 = as.double(rep(0,ntot)), 
            ldet2 = as.double(rep(0,ntot^2)),
            rV=as.double(rep(0,ncol(x)^2)),
            rank.tol=as.double(.Machine$double.eps^.75),rank.est=as.integer(0),
	    n=as.integer(sum(good)),q=as.integer(ncol(x)),M=as.integer(nSp),
            n.theta=as.integer(length(theta)), Mp=as.integer(Mp),Enrow=as.integer(rows.E),
            rSncol=as.integer(rSncol),deriv=as.integer(deriv),
	    fixed.penalty = as.integer(rp$fixed.penalty),nt=as.integer(control$nthreads),
            type=as.integer(gdi.type),dVkk=as.double(rep(0,nSp^2)))

   rV <- matrix(oo$rV,ncol(x),ncol(x)) ## rV%*%t(rV)*scale gives covariance matrix 
   #rV <- rV # transform before return   
   ## derivatives of coefs w.r.t. sps etc...

   ## note that db.drho and dw.drho start with derivs wrt theta, then wrt sp (no scale param of course) 

   db.drho <- if (deriv) matrix(oo$b1,ncol(x),ntot) else NULL ## transform before return

   dw.drho <- if (deriv) matrix(oo$w1,length(z),ntot) else NULL

   Kmat <- matrix(0,nrow(x),ncol(x)) 
   Kmat[good,] <- oo$X                    ## rV%*%t(K)%*%(sqrt(wf)*X) = F; diag(F) is edf array 

   Vg <- NCV <- NCV1 <- REML <- REML1 <- REML2 <- NULL
   if (scoreType=="NCV") {
     eta.cv <- rep(0.0,length(nei$i))
     deta.cv <- if (deriv) matrix(0.0,length(nei$i),ntot) else matrix(0.0,1,ntot)
     w1 <- -dd$Deta/2; w2 <- dd$Deta2/2; dth <- dd$Detath/2 ## !?
     R <- try(chol(crossprod(x,w*x)+St),silent=TRUE)
     if (nei$jackknife > 2) { ## return NCV coef changes for each fold 
       if (deriv>0) stop("jackknife and derivatives requested together")
       dth <- matrix(0,ncol(x),length(nei$m))
       deriv1 <- -1 ## signal to return coef changes
     } else deriv1 <- deriv
     if (inherits(R,"try-error")) { ## use CG approach...
	Hi <- tcrossprod(rV) ## inverse of penalized Expected Hessian - inverse actual Hessian probably better
        cg.iter <- .Call(C_ncv,x,Hi,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv, deta.cv, dth, deriv1);
	warn[[length(warn)+1]] <- "NCV positive definite update check not possible"
     } else { ## use Cholesky update approach
	pdef.fails <- .Call(C_Rncv,x,R,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv,
	                    deta.cv, dth, deriv1,.Machine$double.eps,control$ncv.threads);
	if (pdef.fails) warn[[length(warn)+1]] <- "some NCV updates not positive definite"
     }   
     mu.cv <- linkinv(eta.cv)
     nt <- family$n.theta
     ## if some elements of theta are fixed, figure out which derivatives will be needed...
     if (deriv) keep <- if (length(theta)>nt) (length(theta)+1):(ncol(db.drho)+!scale.known) else 1:(ncol(db.drho)+!scale.known)
   
     dev0 <- dev.resids(y[nei$i], mu[nei$i], weights[nei$i],theta)
     ls0 <- family$ls(y[nei$i],weights[nei$i],theta,scale)
     if (family$qapprox) { ## quadratic approximation to NCV
       qdev <- dev0 + gamma*dd$Deta[nei$i]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2[nei$i]*(eta.cv-eta[nei$i])^2
       NCV <- sum(qdev)/(2*scale) - ls0$ls
       if (deriv) {
         deta <- x %*% db.drho
	 ncv1 <- (dd$Deta[nei$i]*((1-gamma)*deta[nei$i,,drop=FALSE]+gamma*deta.cv) +
	          gamma*dd$Deta2[nei$i]*deta.cv*(eta.cv-eta[nei$i]) +
	          0.5*gamma*as.numeric(dd$Deta3[nei$i])*deta[nei$i,,drop=FALSE]*(eta.cv-eta[nei$i])^2)/(2*scale)
	 if (nt>0) { ## deal with direct dependence on the theta parameters
           ncv1[,1:nt] <- ncv1[,1:nt]- ls0$LSTH1[,1:nt] +
	      if (nt==1) (dd$Dth[nei$i] + gamma*dd$Detath[nei$i]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2th[nei$i]*(eta.cv-eta[nei$i])^2)/(2*scale)
	      else (dd$Dth[nei$i,] + gamma*dd$Detath[nei$i,]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2th[nei$i,]*(eta.cv-eta[nei$i])^2)/(2*scale)
         }
	 if (!scale.known) {
	   ncv1 <- cbind(ncv1,-qdev/(2*scale) - ls0$LSTH1[,1+nt])
         }
       }
     } else { ## exact NCV
       dev.cv <- dev.resids(y, mu.cv, weights,theta)
       NCV <- sum(dev.cv)/(2*scale) - ls0$ls
       DEV <- sum(dev0)/(2*scale) - ls0$ls 
       if (gamma!=1) NCV <- gamma*NCV - (gamma-1)*DEV
       if (deriv) {
         dd.cv <- dDeta(y[nei$i],mu.cv,weights[nei$i],theta,family,1) 
	 ncv1 <- dd.cv$Deta*deta.cv/(2*scale)
	 if (gamma!=1) dev1 <- (dd$Deta*(x%*%db.drho))[nei$i,,drop=FALSE]/(2*scale)
         if (nt>0) {
       	   ncv1[,1:nt] <- ncv1[,1:nt] + as.matrix(dd.cv$Dth/(2*scale)) - ls0$LSTH1[,1:nt]
	   if (gamma!=1) dev1[,1:nt] <- dev1[,1:nt] + as.matrix(dd$Dth/(2*scale))[nei$i,,drop=FALSE] - ls0$LSTH1[,1:nt]
         }
         if (!scale.known) { ## deal with log scale parameter derivative
	   ncv1 <- cbind(ncv1,-dev.cv/(2*scale) - ls0$LSTH1[,1+nt])
	   if (gamma!=1) dev1 <- cbind(dev1,-dev0/(2*scale) - ls0$lSTH1[,1+nt])
         }
	 if (gamma!=1) ncv1 <- gamma*ncv1 - (gamma-1)*dev1
       }
     } ## exact NCV
     
     if (nei$jackknife>2) { 
       nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
       jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights
       dth <-jkw*t(dth)%*%t(T)
       #Vj <- crossprod(dd) ## jackknife cov matrix for coefs (beta)
       #attr(Vj,"dd") <- dd
       #attr(NCV,"Vj") <- Vj
       attr(NCV,"dd") <- dth
     }  

     attr(NCV,"eta.cv") <- eta.cv
     if (deriv) {
       attr(NCV,"deta.cv") <- deta.cv;
       NCV1 <- colSums(ncv1)
       NCV1 <- NCV1[keep] ## drop derivatives for any fixed theta parameters
       Vg <- crossprod(ncv1[,keep,drop=FALSE]) ## empirical cov matrix of grad
     }  
   } else { ## RE/ML
    
     D2 <- matrix(oo$D2,ntot,ntot); ldet2 <- matrix(oo$ldet2,ntot,ntot)
     bSb2 <- matrix(oo$P2,ntot,ntot)
     ## compute the REML score...
     ls <- family$ls(y,weights,theta,scale)
     nt <- length(theta)
     lsth1 <- if (nt>0) ls$lsth1[1:nt] else rep(0,0)
     lsth2 <- if (nt>0) as.matrix(ls$lsth2)[1:nt,1:nt] else matrix(0,0,0) ## exclude any derivs w.r.t log scale here
     REML <- ((dev+oo$P)/(2*scale) - ls$ls)/gamma + (oo$ldet - rp$det)/2 - 
             as.numeric(scoreType=="REML") * Mp * (log(2*pi*scale)/2-log(gamma)/2)
     
     if (deriv) {
       det1 <- oo$ldet1
       if (nSp) {
         ind <- 1:nSp + length(theta)
         det1[ind] <- det1[ind] - rp$det1
       }
       REML1 <- ((oo$D1+oo$P1)/(2*scale) - c(lsth1,rep(0,length(sp))))/gamma + (det1)/2
       if (deriv>1) {
         ls2 <- D2*0;if (nt>0) ls2[1:nt,1:nt] <- lsth2 
         if (nSp) ldet2[ind,ind] <- ldet2[ind,ind] - rp$det2
         REML2 <- ((D2+bSb2)/(2*scale) - ls2)/gamma + ldet2/2
       }
     } 

     if (!scale.known&&deriv) { ## need derivatives wrt log scale, too 
        Dp <- dev + oo$P
        dlr.dlphi <- (-Dp/(2 *scale) - ls$lsth1[nt+1])/gamma - as.numeric(scoreType=="REML") * Mp/2
        d2lr.d2lphi <- (Dp/(2*scale) - ls$lsth2[nt+1,nt+1])/gamma 
        d2lr.dspphi <- -(oo$D1+oo$P1)/(2*scale*gamma) 
        if (nt>0) d2lr.dspphi[1:nt] <- d2lr.dspphi[1:nt] - ls$lsth2[nt+1,1:nt]/gamma
        REML1 <- c(REML1,dlr.dlphi)
        if (deriv==2) {
              REML2 <- rbind(REML2,as.numeric(d2lr.dspphi))
              REML2 <- cbind(REML2,c(as.numeric(d2lr.dspphi),d2lr.d2lphi))
        }
     }
   }
   nth <- length(theta)

   if (deriv) db.drho <- T %*% db.drho

   if (deriv>0&&family$n.theta==0&&nth>0) { ## need to drop derivs for fixed theta
     REML1 <- REML1[-(1:nth)]
     if (deriv>1) REML2 <- REML2[-(1:nth),-(1:nth)]
     db.drho <- db.drho[,-(1:nth),drop=FALSE]
   }  

   names(coef) <- xnames
   names(residuals) <- ynames
   wtdmu <- sum(weights * y)/sum(weights) ## has to then be corrected when this is incorrect
   ## wtdmu <- sum(weights * mu)/sum(weights) ## changed from y
   nulldev <- sum(dev.resids(y, rep(wtdmu,length(y)), weights)) ## this will be corrected in family postproc
   n.ok <- nobs - sum(weights == 0)
   nulldf <- n.ok
   ww <- wt <- rep.int(0, nobs)
   wt[good] <- wf 
   ww[good] <- w
   if (deriv && nrow(dw.drho)!=nrow(x)) {
      w1 <- dw.drho
      dw.drho <- matrix(0,nrow(x),ncol(w1))
      dw.drho[good,] <- w1
   }
   ## dev is used in family$aic to estimate the scale as dev/sum(weights), or is unused.
   ## the form is for compatibility with exponential families, but the estimator can
   ## be biased for low count data in the extended family case. So better to force the
   ## actual scale estimate to be used...
   aic.model <- family$aic(y, mu, theta, weights, dev=scale*sum(weights)) # note: incomplete 2*edf needs to be added
 

   list(coefficients = coef,residuals=residuals,fitted.values = mu,
        family=family, linear.predictors = eta,deviance=dev,
        null.deviance=nulldev,iter=iter,
        weights=wt, ## note that these are Fisher type weights 
        prior.weights=weights,
        working.weights = ww, ## working weights
        df.null = nulldf, y = y, converged = conv,z=z,
        boundary = boundary,
        REML=REML,REML1=REML1,REML2=REML2,NCV=NCV,NCV1=NCV1,
        rV=T %*% rV,db.drho=db.drho,dw.drho=dw.drho,
        scale.est=scale,reml.scale=scale,
        aic=aic.model,
        rank=oo$rank.est,
        K=Kmat,control=control,Vg=Vg,
        dVkk = matrix(oo$dVkk,nSp,nSp),ldetS1 = if (grderiv) rp$det1 else 0
        #,D1=oo$D1,D2=D2,
        #ldet=oo$ldet,ldet1=oo$ldet1,ldet2=ldet2,
        #bSb=oo$P,bSb1=oo$P1,bSb2=bSb2,
        #ls=ls$ls,ls1=ls$lsth1,ls2=ls$lsth2
       )
 
} ## gam.fit4



efsudr <- function(x,y,lsp,Eb,UrS,weights,family,offset=0,start=NULL,etastart=NULL,mustart=NULL,
                   U1=diag(ncol(x)), intercept = TRUE,scale=1,Mp=-1,control=gam.control(),n.true=-1,...) {
## Extended Fellner-Schall method for regular and extended families,
## with PIRLS performed by gam.fit3/4.
## tr(S^-S_j) is returned by ldetS1, rV %*% t(rV)*scale is 
## cov matrix. I think b'S_jb will need to be  computed here.
  nsp <- length(UrS)
  if (inherits(family,"extended.family")) {
    spind <- family$n.theta + 1:nsp
    thind <- if (family$n.theta>0) 1:family$n.theta else rep(0,0)
  } else {
    thind <- rep(0,0)
    spind <- 1:nsp ## index of smoothing params in lsp
  }
  estimate.scale <- (length(lsp)>max(spind))
  lsp[spind] <- lsp[spind] + 2.5 
  mult <- 1
  fit <- gam.fit3(x=x, y=y, sp=lsp, Eb=Eb,UrS=UrS,
            weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, 
            control = control, intercept = intercept,deriv=0,
            gamma=1,scale=scale,scoreType="EFS",
            n.true=n.true,...)
  if (length(thind)>0) lsp[thind] <- family$getTheta()
  if (estimate.scale) lsp[length(lsp)] <- log(fit$scale)
  ## Also need scale estimate. OK from gam.fit3, but gam.fit4 version probably needs correcting
  ## for edf, as obtained via MLE.
  p <- ncol(x)
  n <- nrow(x)
  score.hist <- rep(0,200)
  
  bSb <- trVS <- rep(0,nsp)
  for (iter in 1:200) {
    start <- fit$coefficients
    Y <- U1[,1:(ncol(U1)-Mp)] ## penalty range space
    ## project coefs and rV to Y, since this is space of UrS[[i]]
    Yb <- drop(t(Y)%*%start) 
    rV <- t(fit$rV) ## so t(rV)%*%rV*scale is cov matrix
    rVY <- rV %*% Y
    ## ith penalty is UrS[[i]]%*%t(UrS[[i]])...
    for (i in 1:length(UrS)) {
      xx <- Yb %*% UrS[[i]] 
      bSb[i] <- sum(xx^2)
      xx <- rVY %*% UrS[[i]]
      trVS[i] <- sum(xx^2)
    }
    edf <- p - sum(trVS*exp(lsp[spind]))
    if (inherits(family,"extended.family")&&estimate.scale) {
      fit$scale <- fit$scale*n/(n-edf) ## correct for edf.
    }

    a <- pmax(0,fit$ldetS1*exp(-lsp[spind]) - trVS) ## NOTE: double check scaling here
    phi <- if (estimate.scale) fit$scale else scale
    r <- a/pmax(0,bSb)*phi
    r[a==0&bSb==0] <- 1
    r[!is.finite(r)] <- 1e6
    lsp1 <- lsp
    lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax)
    max.step <- max(abs(lsp1-lsp))
    old.reml <- fit$REML
    fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS,
            weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, 
            control = control, intercept = intercept,deriv=0,mustart=mustart,
            gamma=1,scale=scale,scoreType="EFS",
            n.true=n.true,...)
    if (length(thind)>0) lsp1[thind] <- family$getTheta()
    if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale)
    ## some step length control...
   
    if (fit$REML<=old.reml) { ## improvement
      if (max.step<.05) { ## consider step extension (near optimum)
        lsp2 <- lsp
        lsp2[spind] <- pmin(lsp[spind] + log(r)*mult*2,control$efs.lspmax) ## try extending step...
        fit2 <- gam.fit3(x=x, y=y, sp=lsp2, Eb=Eb,UrS=UrS,
            weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, 
            control = control, intercept = intercept,deriv=0,mustart=mustart,
            gamma=1,scale=scale,scoreType="EFS",
            n.true=n.true,...)
        if (length(thind)>0) lsp2[thind] <- family$getTheta()
        if (estimate.scale) lsp2[length(lsp)] <- log(fit$scale)
        if (fit2$REML < fit$REML) { ## improvement - accept extension
          fit <- fit2;lsp <- lsp2
	  mult <- mult * 2
        } else { ## accept old step
          lsp <- lsp1
        }
      } else lsp <- lsp1
    } else { ## no improvement 
      while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML 
          mult <- mult/2 ## contract step
	  lsp1 <- lsp
          lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax)
	  fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS,
            weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, 
            control = control, intercept = intercept,deriv=0,mustart=mustart,
            gamma=1,scale=scale,scoreType="EFS",
            n.true=n.true,...)
	  if (length(thind)>0) lsp1[thind] <- family$getTheta()
          if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale)
      }
      lsp <- lsp1
      if (mult<1) mult <- 1
    }
    score.hist[iter] <- fit$REML
    ## break if EFS step small and REML change negligible over last 3 steps.
    if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))<control$efs.tol) break
    ## or break if deviance not changing...
    if (iter==1) old.dev <- fit$dev else {
      if (abs(old.dev-fit$dev) < 100*control$eps*abs(fit$dev)) break
      old.dev <- fit$dev
    }
  }
  fit$sp <- exp(lsp)
  fit$iter <- iter
  fit$outer.info <- list(iter = iter,score.hist=score.hist[1:iter])
  fit$outer.info$conv <- if (iter==200) "iteration limit reached" else "full convergence"
  fit
} ## efsudr


gam.fit5 <- function(x,y,lsp,Sl,weights=NULL,offset=NULL,deriv=2,family,scoreType="REML",
                     control=gam.control(),Mp=-1,start=NULL,gamma=1,nei=NULL){
## NOTE: offset handling - needs to be passed to ll code
## fit models by general penalized likelihood method, 
## given doubly extended family in family. lsp is log smoothing parameters
## Stabilization strategy:
## 1. Sl.repara
## 2. Hessian diagonally pre-conditioned if +ve diagonal elements
##    (otherwise indefinite anyway)
## 3. Newton fit with perturbation of any indefinite hessian
## 4. At convergence test fundamental rank on balanced version of 
##    penalized Hessian. Drop unidentifiable parameters and 
##    continue iteration to adjust others.
## 5. All remaining computations in reduced space.
##    
## Idea is that rank detection takes care of structural co-linearity,
## while preconditioning and step 1 take care of extreme smoothing parameters
## related problems. 

  penalized <- if (length(Sl)>0) TRUE else FALSE
  warn <- list()
  nSp <- length(lsp)
  q <- ncol(x)
  nobs <- length(y)
  
  if (penalized) {
    Eb <- attr(Sl,"E") ## balanced penalty sqrt
 
    ## the stability reparameterization + log|S|_+ and derivs... 
    rp <- ldetS(Sl,rho=lsp,fixed=rep(FALSE,length(lsp)),np=q,root=TRUE) 
    x <- Sl.repara(rp$rp,x) ## apply re-parameterization to x
    Eb <- Sl.repara(rp$rp,Eb) ## root balanced penalty
    St <- crossprod(rp$E) ## total penalty matrix
    E <- rp$E ## root total penalty
    attr(E,"use.unscaled") <- TRUE ## signal initialization code that E not to be further scaled   
    if (!is.null(start)) start  <- Sl.repara(rp$rp,start) ## re-para start
    ## NOTE: it can be that other attributes need re-parameterization here
    ##       this should be done in 'family$initialize' - see mvn for an example. 

  } else { ## unpenalized so no derivatives required
    deriv <- 0 
    rp <- list(ldetS=0,rp=list())
    St <- matrix(0,q,q)
    E <- matrix(0,0,q) ## can be needed by initialization code
  }
  ## now call initialization code, but make sure that any 
  ## supplied 'start' vector is not overwritten...
  start0 <- start
  
  ## Assumption here is that the initialization code is fine with
  ##  re-parameterized x...

  eval(family$initialize) 
   
  if (!is.null(start0)) start <- start0 
  coef <- as.numeric(start)

  if (is.null(weights)) weights <- rep.int(1, nobs)
  if (is.null(offset)) offset <- rep.int(0, nobs)
 
  ## get log likelihood, grad and Hessian (w.r.t. coefs - not s.p.s) ...
  llf <- family$ll
  ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) 
  ll0 <- ll$l - drop(t(coef)%*%St%*%coef)/2
  grad <- ll$lb - St%*%coef
  iconv <- max(abs(grad))<control$epsilon*abs(ll0)
  Hp <- -ll$lbb+St
  rank.checked <- FALSE ## not yet checked the intrinsic rank of problem 
  rank <- q
  eigen.fix <- FALSE
  converged <- FALSE
  check.deriv <- FALSE; eps <- 1e-5 
  drop <- NULL;bdrop <- rep(FALSE,q) ## by default nothing dropped
  perturbed <- 0 ## counter for number of times perturbation tried on possible saddle

  for (iter in 1:(2*control$maxit)) { ## main iteration
    ## get Newton step... 
    if (check.deriv) { ## code for checking derivatives when debugging
      fdg <- ll$lb*0; fdh <- ll$lbb*0
      for (k in 1:length(coef)) {
        coef1 <- coef;coef1[k] <- coef[k] + eps
        ll.fd <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
        fdg[k] <- (ll.fd$l-ll$l)/eps
        fdh[,k] <- (ll.fd$lb-ll$lb)/eps
      }
    } ## derivative checking end
    #grad <- ll$lb - St%*%coef 
    #Hp <- -ll$lbb+St
    kappaH <- kappa(Hp)
    D <- diag(Hp)
    if (sum(!is.finite(D))>0) stop("non finite values in Hessian")

    if (min(D)<=0) { ## 2/2/19 replaces any D<0 indicating indef
      Dthresh <- max(D)*sqrt(.Machine$double.eps) 
      if (-min(D) < Dthresh) { ## could be indef or +ve semi def
        indefinite <- FALSE
	D[D<Dthresh] <- Dthresh
      } else indefinite <- TRUE ## min D too negative to be semi def
    } else indefinite <- FALSE ## On basis of D could be +ve def
    
    if (indefinite) { ## Hessian indefinite, for sure
      ## D <- rep(1,ncol(Hp)) # moved to later otherwise Ip/Ib pointless 2/2/19
      if (eigen.fix) {
        eh <- eigen(Hp,symmetric=TRUE);
        ev <- abs(eh$values)
        Hp <- eh$vectors%*%(ev*t(eh$vectors))
      } else {
        Ib <- diag(rank)*abs(min(D))
        Ip <- diag(rank)*abs(max(D)*.Machine$double.eps^.5)
        Hp <- Hp  + Ip + Ib
      }
      D <- rep(1,ncol(Hp)) ## 2/2/19
      indefinite <- TRUE
    } else { ## Hessian could be +ve def in which case Choleski is cheap!
      D <- D^-.5 ## diagonal pre-conditioner
      Hp <- D*t(D*Hp) ## pre-condition Hp
      Ip <- diag(rank)*.Machine$double.eps^.5   
    }
    L <- suppressWarnings(chol(Hp,pivot=TRUE))
    mult <- 1
    while (attr(L,"rank") < rank) { ## rank deficient - add ridge penalty 
      if (eigen.fix) {
        eh <- eigen(Hp,symmetric=TRUE);ev <- eh$values
        thresh <- max(min(ev[ev>0]),max(ev)*1e-6)*mult
        mult <- mult*10
        ev[ev<thresh] <- thresh
        Hp <- eh$vectors%*%(ev*t(eh$vectors)) 
        L <- suppressWarnings(chol(Hp,pivot=TRUE))
      } else {
        L <- suppressWarnings(chol(Hp+Ip,pivot=TRUE))
        Ip <- Ip * 100 ## increase regularization penalty
      }
      indefinite <- TRUE
    }

    piv <- attr(L,"pivot")
    ipiv <- piv;ipiv[piv] <- 1:ncol(L)

    if (converged) break ## converged at end of previous step, so break now L and D match Hp
    
    #if (iconv&&!indefinite) break ## immediate convergence - bad idea - can lead to small sp changes having zero effect on objective.
    
    step <- D*(backsolve(L,forwardsolve(t(L),(D*grad)[piv]))[ipiv])

    c.norm <- sum(coef^2)
    if (c.norm>0) { ## limit step length to .1 of coef length
      s.norm <- sqrt(sum(step^2))
      c.norm <- sqrt(c.norm)
      if (s.norm > .1*c.norm) step <- step*0.1*c.norm/s.norm
    }
    s.norm <- sqrt(sum(step^2)) ## store for scaling steepest if needed
    ## try the Newton step...
    coef1 <- coef + step 
    ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) 
    ll1 <- ll$l - drop(t(coef1)%*%St%*%coef1)/2
    khalf <- 0;fac <- 2

    ## with ll1 < ll0 in place of ll1 <= ll0 in next line than we can repeatedly accept
    ## miniscule steps that do not actually improve anything.
    llold <- ll ## avoid losing lbb slot on stp failure
    while ((!is.finite(ll1)||ll1 <= ll0) && khalf < 25) { ## step halve until it succeeds...
      step <- step/fac;coef1 <- coef + step
      ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0)
      ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2
      if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or at least no worse.
        ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
      }
      ## abort if step has made no difference...
      if (max(abs(coef-coef1))<max(abs(coef))*.Machine$double.eps) khalf <- 100
      khalf <- khalf + 1
      if (khalf>5) fac <- 5
    } ## end step halve

    ## Following tries steepest ascent if Newton failed. Only do this if not initially converged.
    ## Otherwise it is possible for SA to mess up when routine is called
    ## with converged parameter values - takes a tiny step that leads to a machine noise
    ## improvement, but pushes a gradient just over threshold, Newton then continues to fail
    ## and we get stuck with SA convergence rates.
    ## One reason to try SA (if grad not already zero) is that Newton step can become
    ## poor due to numerical conditioning problems, whereas SA is largely unaffected.

    if (!is.finite(ll1) || (ll1<=ll0 && !iconv)) { ## switch to steepest ascent
      #step <- 0.5*drop(grad)*mean(abs(coef))/mean(abs(grad))
      step <- drop(grad)*s.norm/sqrt(sum(grad^2)) ## scaled to initial Newton step length
      khalf <- 0
    }

    while ((!is.finite(ll1)||(ll1 <= ll0 && !iconv)) && khalf < 25) { ## step cut until it succeeds...
      step <- step/10;coef1 <- coef + step
      ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0)
      ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2
      if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or no worse
        ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1)
      }
      ## abort if step has made no difference...
      if (max(abs(coef-coef1))<max(abs(coef))*.Machine$double.eps) khalf <- 100 ## step gone nowhere
      khalf <- khalf + 1
    }

    if ((is.finite(ll1)&&ll1 >= ll0&&khalf<25)||iter==control$maxit) { ## step ok. Accept and test
      coef <- coef + step
      grad <- ll$lb - St%*%coef
      Hp <- -ll$lbb+St
      ## convergence test...
      ok <- (iter==control$maxit || max(abs(grad)) < control$epsilon*abs(ll0))
      if (ok) { ## appears to have converged
        if (indefinite) { ## not a well defined maximum
          if (perturbed==5) stop("indefinite penalized likelihood in gam.fit5 ")
          if (iter<4||rank.checked) {
            perturbed <- perturbed + 1
            coef <- coef*(1+(rep_len(c(0,1),length(coef))*.02-.01)*perturbed) + 
                    (rep_len(c(0,1),length(coef)) - 0.5 ) * mean(abs(coef))*1e-5*perturbed 
            ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) 
            ll0 <- ll$l - (t(coef)%*%St%*%coef)/2
          } else {        
            rank.checked <- TRUE
            if (penalized) {
              Sb <- crossprod(Eb) ## balanced penalty
              Hb <- -ll$lbb/norm(ll$lbb,"F")+Sb/norm(Sb,"F") ## balanced penalized hessian
            } else Hb <- -ll$lbb/norm(ll$lbb,"F")
            ## apply pre-conditioning, otherwise badly scaled problems can result in
            ## wrong coefs being dropped...
            D <- abs(diag(Hb))
            D[D<1e-50] <- 1;D <- D^-.5
            Hb <- t(D*Hb)*D
            qrh <- qr(Hb,LAPACK=TRUE)
            rank <- Rrank(qr.R(qrh))
            if (rank < q) { ## rank deficient. need to drop and continue to adjust other params
              drop <- sort(qrh$pivot[(rank+1):q]) ## set these params to zero 
              bdrop <- 1:q %in% drop ## TRUE FALSE version
              ## now drop the parameters and recompute ll0...
              lpi <- attr(x,"lpi")
              xat <- attributes(x)
              xat$dim <- xat$dimnames <- NULL
              coef <- coef[-drop]
              St <- St[-drop,-drop]
              x <- x[,-drop] ## dropping columns from model matrix
              if (!is.null(lpi)) { ## need to adjust column indexes as well
                ii <- (1:q)[!bdrop];ij <- rep(NA,q)
                ij[ii] <- 1:length(ii) ## col i of old model matrix is col ij[i] of new 
               
                for (i in 1:length(lpi)) {
                  lpi[[i]] <- ij[lpi[[i]][!(lpi[[i]]%in%drop)]] # drop and shuffle up
                }
              } ## lpi adjustment done
              if (length(xat)>0) for (i in 1:length(xat)) attr(x,names(xat)[i]) <- xat[[i]]
              attr(x,"lpi") <- lpi
              attr(x,"drop") <- drop ## useful if family has precomputed something from x
              ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) 
              ll0 <- ll$l - (t(coef)%*%St%*%coef)/2
	      grad <- ll$lb - St%*%coef
              Hp <- -ll$lbb+St
            } 
          }

        } else { ## not indefinite really converged
          converged <- TRUE
          # ... don't break until L and D made to match final Hp
        }
      } else ll0 <- ll1 ## step ok but not converged yet
    } else { ## step failed.
      ll <- llold ## restore old ll with lbb slot
      if (is.null(drop)) bdrop <- rep(FALSE,q)
      if (iconv && iter==1) { ## OK to fail on first step if apparently converged to start with
        converged <- TRUE     ## Note: important to check if improvement possible even if apparently
	coef <- start         ## converged, otherwise sp changes can lead to no sp objective change!
      } else {
        converged  <- FALSE
	## NOTE: the threshold can be unrealistic if gradient or step can't be computed to this accuracy, e.g.
	## because of very large smoothing parameters - could estimate grad/step accuracy from machine zero
	## perturbation of grad/step calc, but perhaps too fussy.
	coefp <- coef*(1+rep_len(c(-1,1),length(coef))*.Machine$double.eps^.9)
	llp <- llf(y,x,coef,weights,family,offset=offset,deriv=1)
	gradp <- llp$lb - St%*%coefp
	err <- min(1e-3,kappaH*max(1,mean(abs(gradp-grad))/mean(abs(coefp-coef)))*.Machine$double.eps)
	## err is an estimate of the acheivable relative error, capped above at 1e-3 to ensure
	## this level of stability loss gets reported!
        if (max(abs(grad/drop(ll0)))>max(err,control$epsilon*2)) warn[[length(warn)+1]] <-
	  paste("gam.fit5 step failed: max magnitude relative grad =",max(abs(grad/drop(ll0))))
      }
      break ## no need to recompute L and D, so break now
    }
  } ## end of main fitting iteration

  ## at this stage the Hessian (of pen lik. w.r.t. coefs) should be +ve semi definite,
  ## so that the pivoted Choleski factor should exist...
  
  if (iter == 2*control$maxit&&converged==FALSE) 
    warn[[length(warn)+1]] <- gettextf("gam.fit5 iteration limit reached: max abs grad = %g",max(abs(grad)))

  ldetHp <- 2*sum(log(diag(L))) - 2 * sum(log(D)) ## log |Hp|

  if (!is.null(drop)) { ## create full version of coef with zeros for unidentifiable 
    fcoef <- rep(0,length(bdrop));fcoef[!bdrop] <- coef
  } else fcoef <- coef

  dVkk <- d1l <- d2l <- d1bSb <- d2bSb <- d1b <- d2b <- d1ldetH <- d2ldetH <- d1b <- d2b <- NULL
  ncv <- scoreType=="NCV"
  if (deriv>0) {  ## Implicit differentiation for derivs...

    m <- nSp
    d1b <- matrix(0,rank,m)
    Sib <- Sl.termMult(rp$Sl,fcoef,full=TRUE) ## list of penalties times coefs
    if (nSp) for (i in 1:m) d1b[,i] <- 
       -D*(backsolve(L,forwardsolve(t(L),(D*Sib[[i]][!bdrop])[piv]))[ipiv])
  
    ## obtain the curvature check matrix...
    dVkk <- crossprod(L[,ipiv]%*%(d1b/D))

    if (!is.null(drop)) { ## create full version of d1b with zeros for unidentifiable 
      fd1b <-  matrix(0,q,m)
      fd1b[!bdrop,] <- d1b
    } else fd1b <- d1b

    ## Now call the family again to get first derivative of Hessian w.r.t
    ## smoothing parameters, in list d1H. Alternatively, if deriv==1 and !ncv
    ## then tr(Hi d1H) returne as a vector in 'd1H'.

    ll <- if (ncv) llf(y,x,coef,weights,family,offset=offset,deriv=3,d1b=d1b,ncv=TRUE) else
                   llf(y,x,coef,weights,family,offset=offset,deriv=2+(deriv>1),d1b=d1b,fh=D*t(D*chol2inv(L)[ipiv,ipiv]))
    # d1l <- colSums(ll$lb*d1b) # cancels
    

    if (deriv>1) { ## Implicit differentiation for the second derivatives is now possible...

      d2b <- matrix(0,rank,m*(m+1)/2)
      k <- 0
      for (i in 1:m) for (j in i:m) {
        k <- k + 1
        v <- -ll$d1H[[i]]%*%d1b[,j] + Sl.mult(rp$Sl,fd1b[,j],i)[!bdrop] + Sl.mult(rp$Sl,fd1b[,i],j)[!bdrop]
        d2b[,k] <- -D*(backsolve(L,forwardsolve(t(L),(D*v)[piv]))[ipiv])
        if (i==j) d2b[,k] <- d2b[,k] + d1b[,i]
      } 
  
      ## Now call family for last time to get trHid2H the tr(H^{-1} d^2 H / drho_i drho_j)...

      llr <- llf(y,x,coef,weights,family,offset=offset,deriv=4,d1b=d1b,d2b=d2b,
                       Hp=Hp,rank=rank,fh = L,D=D)

      ## Now compute Hessian of log lik w.r.t. log sps using chain rule
       
      # d2la <- colSums(ll$lb*d2b) # cancels
      # k <- 0
      d2l <- matrix(0,m,m)
      for (i in 1:m) for (j in i:m) {
        # k <- k + 1
        d2l[j,i] <- d2l[i,j] <- # d2la[k] + # cancels
	                        t(d1b[,i])%*%ll$lbb%*%d1b[,j] 
      }
    } ## if (deriv > 1)
  } ## if (deriv > 0)

  Vg <- NULL ## jackknife grad matrix in NCV case

  if (scoreType=="NCV") {
    REML <- REML1 <- REML2 <- NULL
    if (deriv==0) ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1,d1b=d1b,ncv=TRUE) ## otherwise l1, l2 not returned
    ncv <- family$ncv ## helps debugging!
    deriv1 <-  if (nei$jackknife>2) -1 else if (deriv==0) 0 else  1
    ## create nei if null - now in estimate.gam
    #if (is.null(nei)||is.null(nei$k)||is.null(nei$m)) nei <- list(i=1:nobs,mi=1:nobs,m=1:nobs,k=1:nobs) ## LOOCV
    #if (is.null(nei$i)) if (length(nei$m)==nobs) nei$mi <- nei$i <- 1:nobs else stop("unclear which points NCV neighbourhoods belong to")
    #if (length(nei$mi)!=length(nei$m)) stop("for NCV number of dropped and predicted neighbourhoods must match")
    ## complete dH
    if (deriv>0) {
      for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- ll$d1H[[i]] - Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop] 
    }
  
    R1 <- try(chol(t(Hp/D)/D),silent=TRUE)
    ll$gamma <- gamma;
    ## note: use of quadratic approx to NCV signalled by family$qapprox
    #if (overlap||inherits(R1,"try-error"))
    if (inherits(R1,"try-error")) {
      ## get H (Hp?) and Hi
      Hi <- t(D*chol2inv(L)[ipiv,ipiv])*D
      ret <- ncv(x,y,weights,nei,coef,family,ll,H=t(Hp/D)/D,Hi=Hi,offset=offset,dH=ll$d1H,
                 db=d1b,deriv=deriv1)
      warn[[length(warn)+1]] <- "NCV update positive definite check not possible"
    } else { ## cholesky version
      ret <- ncv(x,y,weights,nei,coef,family,ll,R=R1,offset=offset,dH=ll$d1H,db=d1b,
                 deriv=deriv1,nt=control$ncv.threads)
      if (ret$error>0) warn[[length(warn)+1]] <- "some NCV updates not positive definite" 		 
    }
    NCV <- ret$NCV
    NCV1 <- ret$NCV1
    Vg <- ret$Vg
    if (deriv1<0) { ## Jackknife cov matrix...
      nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
      jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights
      dd <- attr(ret$NCV,"deta.cv")
      #dd <-jkw*t(dd)%*%t(T)
      dd <- Sl.repa(rp$rp,t(dd),l=-1) ## undo repara
      dd <- Sl.inirep(Sl,dd,l=1) ## undo initial repara
      #Vj <- tcrossprod(dd) ## jackknife cov matrix
      #attr(NCV,"Vj") <- Vj
      attr(NCV,"dd") <- jkw*t(dd)
    }
  } else { ## REML required
    NCV <- NCV1 <- NULL
    ## Compute the derivatives of log|H+S|... 
    if (deriv > 0) { ## only first derivative wrt rho required...
      if (deriv==1&&!is.list(ll$d1H)) {
        d1ldetH <- -ll$d1H
        for (i in 1:m) {
          A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop]
          bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block
          A <- A[,bind,drop=FALSE] ## drop the zero columns  
          A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE])
          d1ldetH[i] <- d1ldetH[i] + sum(diag(A[bind,,drop=FALSE]))
        }
      } else { ## deriv>1 || is.list(ll$d1H)
        ## computation of d1ldetH using full d1H derivative of Hessian w.r.t. sp matrices...
        d1ldetH <- rep(0,m)
        d1Hp <- list()
        for (i in 1:m) {
          A <- -ll$d1H[[i]] + Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop]
          d1Hp[[i]] <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE])  
          d1ldetH[i] <- sum(diag(d1Hp[[i]]))
        }
      }	
    } ## if (deriv >0)

    if (deriv > 1) {
      ## second derivatives...
      d2ldetH <- matrix(0,m,m)
      k <- 0
      for (i in 1:m) for (j in i:m) {
        k <- k + 1
        d2ldetH[i,j] <- -sum(d1Hp[[i]]*t(d1Hp[[j]])) - llr$trHid2H[k] 
        if (i==j) { ## need to add term relating to smoothing penalty
          A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop]
          bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block
          A <- A[,bind,drop=FALSE] ## drop the zero columns  
          A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE])
          d2ldetH[i,j] <- d2ldetH[i,j] + sum(diag(A[bind,,drop=FALSE]))
        } else d2ldetH[j,i] <- d2ldetH[i,j]
      }
    } ## if (deriv > 1)

    ## Compute derivs of b'Sb...

    if (deriv>0) {
      Skb <- Sl.termMult(rp$Sl,fcoef,full=TRUE)
      d1bSb <- rep(0,m)
      for (i in 1:m) { 
        Skb[[i]] <- Skb[[i]][!bdrop]
        d1bSb[i] <- sum(coef*Skb[[i]])
      }
    }
 
    if (deriv>1) {
      d2bSb <- matrix(0,m,m)
      for (i in 1:m) {
        Sd1b <- St%*%d1b[,i] 
        for (j in i:m) {
          d2bSb[j,i] <- d2bSb[i,j] <- 2*sum( 
          d1b[,i]*Skb[[j]] + d1b[,j]*Skb[[i]] + d1b[,j]*Sd1b)
        }
        d2bSb[i,i] <-  d2bSb[i,i] + sum(coef*Skb[[i]]) 
      }
    }

    ## get grad and Hessian of REML score...
    REML <- -as.numeric((ll$l - drop(t(coef)%*%St%*%coef)/2)/gamma + rp$ldetS/2  - ldetHp/2  +
             Mp*(log(2*pi)/2)-log(gamma)/2)
 
    REML1 <- if (deriv<1) NULL else -as.numeric( # d1l # cancels
                                   - d1bSb/(2*gamma) + rp$ldet1/2  - d1ldetH/2 ) 
    REML2 <- if (deriv<2) NULL else -( (d2l - d2bSb/2)/gamma + rp$ldet2/2  - d2ldetH/2 ) 
  } ## REML computations
  
  if (control$trace) {
    cat("\niter =",iter,"  ll =",ll$l,"  REML =",REML,"  bSb =",t(coef)%*%St%*%coef/2,"\n")
    cat("log|S| =",rp$ldetS,"  log|H+S| =",ldetHp,"  n.drop =",length(drop),"\n")
    if (!is.null(REML1)) cat("REML1 =",REML1,"\n")
  }
  

  ## Get possibly multiple linear predictors
  lpi <- attr(x,"lpi")
  if (is.null(lpi)) { ## only one...
    linear.predictors <- if (is.null(offset)) as.numeric(x%*%coef) else as.numeric(x%*%coef+offset)
    fitted.values <- family$linkinv(linear.predictors) 
  } else { ## multiple...
    fitted.values <- linear.predictors <- matrix(0,nrow(x),length(lpi))
    if (!is.null(offset)) offset[[length(lpi)+1]] <- 0
    for (j in 1:length(lpi)) {
      linear.predictors[,j] <- as.numeric(x[,lpi[[j]],drop=FALSE] %*% coef[lpi[[j]]])
      if (!is.null(offset[[j]])) linear.predictors[,j] <-  linear.predictors[,j] + offset[[j]]
      fitted.values[,j] <- family$linfo[[j]]$linkinv( linear.predictors[,j]) 
    }
  }
  coef <- Sl.repara(rp$rp,fcoef,inverse=TRUE) ## undo re-parameterization of coef 
  #coef <- Sl.repa(rp$rp,fcoef,l=-1)
  if (!is.null(drop)&&!is.null(d1b)) { ## create full version of d1b with zeros for unidentifiable 
    db.drho <- matrix(0,length(bdrop),ncol(d1b));db.drho[!bdrop,] <- d1b
  } else db.drho <- d1b
  ## and undo re-para...
  #if (!is.null(d1b)) db.drho <- t(Sl.repara(rp$rp,t(db.drho),inverse=TRUE,both.sides=FALSE)) ## wrong
  if (!is.null(d1b)) db.drho <- Sl.repa(rp$rp,db.drho,l=-1)
  #if (!is.null(d2b)) d2b <- Sl.repa(rp$rp,d2b,l=-1) ## NOTE: DEBUG only
  ## Following needed for debugging H derivatives if Cholesky stabilization used
  #if (!is.null(ll$d1H)) for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- Sl.repa(rp$rp,ll$d1H[[i]],l=2,r=1) ## debug
  #ll$lbb <- Sl.repa(rp$rp,ll$lbb,l=2,r=1) ## debug

  ret <- list(coefficients=coef,family=family,y=y,prior.weights=weights,
       fitted.values=fitted.values, linear.predictors=linear.predictors,
       scale.est=1, ### NOTE: needed by newton, but what is sensible here? 
       REML= REML,REML1= REML1,REML2=REML2,NCV=NCV,NCV1=NCV1,
       rank=rank,aic = -2*ll$l, ## 2*edf needs to be added
       ##deviance = -2*ll$l,
       l= ll$l,## l1 =d1l,l2 =d2l,
       lbb = ll$lbb, ## Hessian of log likelihood
       L=L, ## chol factor of pre-conditioned penalized hessian
       bdrop=bdrop, ## logical index of dropped parameters
       D=D, ## diagonal preconditioning matrix
       St=St, ## total penalty matrix
       rp = rp$rp,Vg=Vg, 
       db.drho = db.drho, ## derivative of penalty coefs w.r.t. log sps.
       #bSb = bSb, bSb1 =  d1bSb,bSb2 =  d2bSb,
       S1=rp$ldet1,
       #S=rp$ldetS,S1=rp$ldet1,S2=rp$ldet2,
       #Hp=ldetHp,Hp1=d1ldetH,Hp2=d2ldetH,
       #b2 = d2b,
       iter=iter,H = ll$lbb,dH = ll$d1H,dVkk=dVkk,warn=warn)#,d2H=llr$d2H)
    ## debugging code to allow components of 2nd deriv of hessian w.r.t. sp.s 
    ## to be passed to deriv.check.... 
    #if (!is.null(ll$ghost1)&&!is.null(ll$ghost2)) { 
    #  ret$ghost1 <- ll$ghost1; ret$ghost2 <- ret$ghost2
    #} 
    ret
} ## end of gam.fit5

efsud <- function(x,y,lsp,Sl,weights=NULL,offset=NULL,family,
                     control=gam.control(),Mp=-1,start=NULL) {
## Extended Fellner-Schall method for general families
## tr(S^-S_j) is returned by ldetS as ldet1 - S1 from gam.fit5
## b'S_jb is computed as d1bSb in gam.fit5
## tr(V S_j) will need to be computed using Sl.termMult
##   Sl returned by ldetS and Vb computed as in gam.fit5.postproc.
  tol <- 1e-6
  lsp <- lsp + 2.5
  mult <- 1
  fit <- gam.fit5(x=x,y=y,lsp=lsp,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family,
                     control=control,Mp=Mp,start=start,gamma=1)
  score.hist <- rep(0,200)
  tiny <- .Machine$double.eps^.5 ## used to bound above zero
  for (iter in 1:200) {
    start <- fit$coefficients
    ## obtain Vb...
    ipiv <- piv <- attr(fit$L,"pivot")
    p <- length(piv)
    ipiv[piv] <- 1:p
    Vb <- crossprod(forwardsolve(t(fit$L),diag(fit$D,nrow=p)[piv,,drop=FALSE])[ipiv,,drop=FALSE])
    if (sum(fit$bdrop)) { ## some coefficients were dropped...
      q <- length(fit$bdrop)
      ibd <- !fit$bdrop
      Vtemp <- Vb; Vb <- matrix(0,q,q)
      Vb[ibd,ibd] <- Vtemp
    }
    Vb <- Sl.repara(fit$rp,Vb,inverse=TRUE)
    SVb <- Sl.termMult(Sl,Vb) ## this could be made more efficient
    trVS <- rep(0,length(SVb))
    for (i in 1:length(SVb)) {
      ind <- attr(SVb[[i]],"ind")
      trVS[i] <- sum(diag(SVb[[i]][,ind]))
    }
    Sb <- Sl.termMult(Sl,start,full=TRUE)
    bSb <- rep(0,length(Sb))
    for (i in 1:length(Sb)) {
      bSb[i] <- sum(start*Sb[[i]])
    }
   
    a <- pmax(tiny,fit$S1*exp(-lsp) - trVS)
    r <- a/pmax(tiny,bSb)
    r[a==0&bSb==0] <- 1
    r[!is.finite(r)] <- 1e6
    lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax)
    max.step <- max(abs(lsp1-lsp))
    old.reml <- fit$REML
    fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0,
                    family=family,control=control,Mp=Mp,start=start,gamma=1)
    ## some step length control...
   
    if (fit$REML<=old.reml) { ## improvement
      if (max.step<.05) { ## consider step extension
        lsp2 <- pmin(lsp + log(r)*mult*2,12) ## try extending step...
        fit2 <- gam.fit5(x=x,y=y,lsp=lsp2,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family,
                     control=control,Mp=Mp,start=start,gamma=1)
     
        if (fit2$REML < fit$REML) { ## improvement - accept extension
          fit <- fit2;lsp <- lsp2
	  mult <- mult * 2
        } else { ## accept old step
          lsp <- lsp1
        }
      } else lsp <- lsp1
    } else { ## no improvement 
      while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML 
          mult <- mult/2 ## contract step
          lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax)
	  fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family,
                        control=control,Mp=Mp,start=start,gamma=1)
      }
      lsp <- lsp1
      if (mult<1) mult <- 1
    }
    score.hist[iter] <- fit$REML
    ## break if EFS step small and REML change negligible over last 3 steps.
    if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))<control$efs.tol) break
    ## or break if log lik not changing...
    if (iter==1) old.ll <- fit$l else {
      if (abs(old.ll-fit$l)<100*control$eps*abs(fit$l)) break
      old.ll <- fit$l
    }
  }
  fit$sp <- exp(lsp)
  fit$iter <- iter
  fit$outer.info <- list(iter = iter,score.hist=score.hist[1:iter])
  fit$outer.info$conv <- if (iter==200) "iteration limit reached" else "full convergence"
  if (!is.null(fit$warn)&&length(fit$warn)>0) for (i in 1:length(fit$warn)) warning(fit$warn[[i]])
  fit
} ## efsud

gam.fit5.post.proc <- function(object,Sl,L,lsp0,S,off,gamma) {
## object is object returned by gam.fit5, Sl is penalty object, L maps working sp
## vector to full sp vector 
## Computes:
## R - unpivoted Choleski of estimated expected hessian of ll 
## Vb - the Bayesian cov matrix,
## Ve - "frequentist" alternative
## F - the EDF matrix
## edf = diag(F) and edf2 = diag(2F-FF)
## Main issue is that lbb and lbb + St must be +ve definite for
## F to make sense.
## NOTE: what comes in is in stabilizing parameterization from 
##       gam.fit5, and may have had parameters dropped. 
##       possibly initial reparam needs to be undone here as well
##       before formation of F....
  lbb <- -object$lbb ## Hessian of log likelihood in fit parameterization
  p <- ncol(lbb)
  ipiv <- piv <- attr(object$L,"pivot")
  ipiv[piv] <- 1:p
  ##  Vb0 <- crossprod(forwardsolve(t(object$L),diag(object$D,nrow=p)[piv,])[ipiv,])

  ## Bayes cov matrix with learning rate 1/gamma. Wrong - parameterization s.t. Vp*gamma is it
  #Vl <- if (gamma==1) NULL else chol2inv(chol(-object$lbb/gamma+object$St))
  

  ## need to pre-condition lbb before testing rank...
  lbb <- object$D*t(object$D*lbb)
 
  R <- suppressWarnings(chol(lbb,pivot=TRUE)) 
  
  if (attr(R,"rank") < ncol(R)) { 
    ## The hessian of the -ve log likelihood is not +ve definite
    ## Find the "nearest" +ve semi-definite version and use that
    retry <- TRUE;tol <- 0
    eh <- eigen(lbb,symmetric=TRUE)
    mev <- max(eh$values);dtol <- 1e-7
    while (retry) {
      eh$values[eh$values<tol*mev] <- tol*mev
      R <- sqrt(eh$values)*t(eh$vectors)
      lbb <- crossprod(R)
      Hp <- lbb + object$D*t(object$D*object$St) ## pre-conditioned Hp
      ## Now try to invert it by Choleski with diagonal pre-cond,
      ## to get Vb
      object$L <- suppressWarnings(chol(Hp,pivot=TRUE))
      if (attr(object$L,"rank")==ncol(Hp)) {
        R <- t(t(R)/object$D) ## so R'R = lbb (original)
        retry <- FALSE
      } else { ##  failure: make more +ve def
        tol <- tol + dtol;dtol <- dtol*10
      }
    } ## retry  
  } else { ## hessian +ve def, so can simply use what comes from fit directly
    ipiv <- piv <- attr(R,"pivot")
    ipiv[piv] <- 1:p
    R <- t(t(R[,ipiv])/object$D) ## so now t(R)%*%R = lbb (original)
  } 
  ## DL'LD = penalized Hessian, which needs to be inverted
  ## to DiLiLi'Di = Vb, the Bayesian cov matrix...
  ipiv <- piv <- attr(object$L,"pivot")
  ipiv[piv] <- 1:p
  Vb <- crossprod(forwardsolve(t(object$L),diag(object$D,nrow=p)[piv,,drop=FALSE])[ipiv,,drop=FALSE])

  ## Insert any zeroes required as a result of dropping 
  ## unidentifiable parameters...
  if (sum(object$bdrop)) { ## some coefficients were dropped...
    q <- length(object$bdrop)
    ibd <- !object$bdrop
    Vtemp <- Vb; Vb <- matrix(0,q,q)
    Vb[ibd,ibd] <- Vtemp
    Rtemp <- R; R <- matrix(0,q,q)
    R[ibd,ibd] <- Rtemp
    lbbt <- lbb;lbb <- matrix(0,q,q)
    lbb[ibd,ibd] <- lbbt
  }  

  edge.correct <- FALSE 
  ## compute the smoothing parameter uncertainty correction...
  if (!is.null(object$outer.info$hess)&&!is.null(object$db.drho)) {
    hess <- object$outer.info$hess
    edge.correct <- if (is.null(attr(hess,"edge.correct"))) FALSE else TRUE
    K <- if (edge.correct) 2 else 1
    for (k in 1:K) {
      if (k==1) { ## fitted model computations
        db.drho <- object$db.drho
        dw.drho <- object$dw.drho
        lsp <- log(object$sp)
      } else { ## edge corrected model computations
        db.drho <- attr(hess,"db.drho1")
        dw.drho <- attr(hess,"dw.drho1")
        lsp <- attr(hess,"lsp1")
	hess <- attr(hess,"hess1")
      }
      if (!is.null(L)) db.drho <- db.drho%*%L ## transform to derivs w.r.t. working
      ev <- eigen(hess,symmetric=TRUE)
      d <- ev$values;ind <- d <= 0
      d[ind] <- 0;d[!ind] <- 1/sqrt(d[!ind])
      db.drho <- Sl.inirep(Sl,db.drho,1,0)
      Vc <- crossprod((d*t(ev$vectors))%*%t(db.drho)) ## first correction
   
      d <- ev$values; d[ind] <- 0;
      d <- if (k==1) 1/sqrt(d+1/50) else 1/sqrt(d+1e-7)
  
      Vr <- crossprod(d*t(ev$vectors))
     
      if (k==1) {
        Vc1 <- Vc; Vr1 <- Vr; lsp1 <- lsp ## un-shifted version to use for edf
      } 
      ## reverse the various re-parameterizations...
    }
    rp <- if (edge.correct) attr(object$outer.info$hess,"rp") else object$rp
    #Vc <- Sl.repara(rp,Vc,inverse=TRUE) ## BUG: appears to be double correction - db already corrected??
    #Vc <-  Sl.initial.repara(Sl,Vc,inverse=TRUE)
    V.sp <- Vr;attr(V.sp,"L") <- L
  } else {
    Vc <- 0; db.drho <- object$db.drho
    V.sp <- NULL
  }  
  Vb <- Sl.repara(object$rp,Vb,inverse=TRUE)
  Vb <-  Sl.initial.repara(Sl,Vb,inverse=TRUE)
  Vc <- Vb + Vc
  if (edge.correct) { 
    Vc1 <- Sl.repara(object$rp,Vc1,inverse=TRUE) 
    Vc1 <-  Sl.initial.repara(Sl,Vc1,inverse=TRUE)
    Vc1 <- Vb + Vc1 
  } 
  #R <- Sl.repara(object$rp,R,inverse=TRUE,both.sides=FALSE)
  R <- Sl.repa(object$rp,R,r=1)
  R <-  Sl.initial.repara(Sl,R,inverse=TRUE,both.sides=FALSE,cov=FALSE)
  F <- Vb%*%crossprod(R)
  Ve <- F%*%Vb ## 'frequentist' cov matrix
  edf <- diag(F)
  ## note that edf1 is a heuristic upper bound on EDF - it's the 
  ## df of the best unpenalized approx to the 1st order bias corrected
  ## model. This is larger than edf2 should be, because of bias correction variability,
  ##  but is bounded in a way that is not *guaranteed* for edf2. Note that 
  ## justification only applies to sum(edf1/2) not elementwise   
  if (!is.null(object$outer.info$hess)&&!is.null(object$db.drho)) { 
    ## second correction term is easier computed in original parameterization...
    Vc <- Vc + Vb.corr(R,L,lsp0,S,off,dw=NULL,w=NULL,lsp,Vr)
    if (edge.correct) Vc1 <- Vc1 +
      Vb.corr(R,L,lsp0,S,off,dw=NULL,w=NULL,lsp1,Vr1) else Vc1 <- Vc
  }
  edf1 <- 2*edf - rowSums(t(F)*F) 
  edf2 <- if (edge.correct) rowSums(Vc1*crossprod(R)) else rowSums(Vc*crossprod(R))
  if (sum(edf2)>sum(edf1)) edf2 <- edf1 
  ## note hat not possible here...
  list(Vc=Vc,Vp=Vb,Ve=Ve,V.sp=V.sp,edf=edf,edf1=edf1,edf2=edf2,#F=F,
       R=R,db.drho=db.drho)
} ## gam.fit5.post.proc


deriv.check5 <- function(x, y, sp, 
            weights = rep(1, length(y)), start = NULL,
            offset = rep(0, length(y)),Mp,family = gaussian(), 
            control = gam.control(),deriv=2,eps=1e-7,spe=1e-3,
            Sl,gamma=1,nei=nei,...)
## FD checking of derivatives for gam.fit5: a debugging routine
{  if (!deriv%in%c(1,2)) stop("deriv should be 1 or 2")
   if (control$epsilon>1e-9) control$epsilon <- 1e-9 
   ## first obtain the fit corresponding to sp...
   b <- gam.fit5(x=x,y=y,lsp=sp,Sl=Sl,weights=weights,offset=offset,deriv=deriv,
        family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei)
   ## now get the derivatives of the likelihood w.r.t. coefs...
   ll <- family$ll(y=y,X=x,coef=b$coefficients,wt=weights,family=family,
                   deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL)
   ## and finite difference versions of these...
   p <- length(b$coefficients)
   fdg <- rep(0,p)
   fdh <- matrix(0,p,p)
   for (i in 1:p) {
     coef1 <- b$coefficients;coef1[i] <- coef1[i] + eps
     ll1 <- family$ll(y=y,X=x,coef=coef1,wt=weights,family=family,
                      deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL)
     fdg[i] <- (ll1$l - ll$l)/eps
     fdh[,i] <- (ll1$lb - ll$lb)/eps
   }
   ## display them... 
   oask <- devAskNewPage(TRUE)
   on.exit(devAskNewPage(oask))
   plot(ll$lb,fdg,xlab="computed",ylab="FD",main="grad of log lik");abline(0,1)
   cat("log lik grad cor. =",cor(ll$lb,fdg),"\n")
   plot(ll$lbb,fdh,xlab="computed",ylab="FD",main="hess of log lik");abline(0,1)
   cat("log lik hess cor. =",cor(as.numeric(ll$lbb),as.numeric(fdh)),"\n")
   ## now we need to investigate the derivatives w.r.t. the log smoothing parameters.    
   M <- length(sp) ## number of smoothing parameters
   fd.br <- matrix(0,p,M)
   REML1 <- rep(0,M)
   fd.dH <- list()
   if (!is.null(b$b2)) fd.br2 <- b$b2*0
   k <- 0
   for (i in 1:M) { ## the smoothing parameter loop
     sp0 <- sp1 <- sp;sp1[i] <- sp[i] + spe/2;sp0[i] <- sp[i] - spe/2
     b0 <- gam.fit5(x=x,y=y,lsp=sp0,Sl=Sl,weights=weights,offset=offset,deriv=1,
          family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei)
     b1 <- gam.fit5(x=x,y=y,lsp=sp1,Sl=Sl,weights=weights,offset=offset,deriv=1,
          family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei)
     fd.br[,i] <- (b1$coefficients - b0$coefficients)/spe
     if (!is.null(b$b2)) {
       for (j in i:M) {
         k <- k + 1
         fd.br2[,k] <- (b1$db.drho[,j]-b0$db.drho[,j])/spe
       }	
     }  
     REML1[i] <- (b1$REML-b0$REML)/spe
     fd.dH[[i]] <- (b1$lbb - b0$lbb)/spe
   }
   ## plot db.drho against fd versions...
   for (i in 1:M) {
     plot(b$db.drho[,i],fd.br[,i],xlab="computed",ylab="FD",main="db/drho");abline(0,1)
     cat("cor db/drho[",i,"] = ",cor(b$db.drho[,i],fd.br[,i]),"\n")
   }
   ## second deriv of b
   if (!is.null(b$b2)) for (i in 1:ncol(b$b2)) {
     plot(b$b2[,i],fd.br2[,i],xlab="computed",ylab="FD",main="d2b/drhorho");abline(0,1)
     cat("cor d2b[",i,"] = ",cor(b$b2[,i],fd.br2[,i]),"\n")
   }
   ## plot first deriv Hessian against FD version
   for (i in 1:M) {
     plot(b$dH[[i]],fd.dH[[i]],xlab="computed",ylab="FD",main="dH/drho");abline(0,1)
     cat("cor dH/drho[",i,"] = ",cor(as.numeric(b$dH[[i]]),as.numeric(fd.dH[[i]])),"\n")
   }
   list(fd=list(lb=fdg,lbb=fdh,REML1=REML1,db.drho=fd.br,dH=fd.dH),
        lb=ll$lb,lbb=ll$lbb,REML1=b$REML1,db.drho=b$db.drho,dH=b$dH)
} ## deriv.check5
Any scripts or data that you put into this service are public.
mgcv documentation built on May 29, 2024, 4:34 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mgcv
Mixed GAM Computation Vehicle with Automatic Smoothness Estimation

R/gam.fit4.r
In mgcv: Mixed GAM Computation Vehicle with Automatic Smoothness Estimation

Defines functions deriv.check5 gam.fit5.post.proc efsud gam.fit5 efsudr gam.fit4 fmud.test corb fetad.test dDeta

Documented in dDeta gam.fit5.post.proc

Try the mgcv package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mgcv Mixed GAM Computation Vehicle with Automatic Smoothness Estimation

R/gam.fit4.r In mgcv: Mixed GAM Computation Vehicle with Automatic Smoothness Estimation

Defines functions deriv.check5 gam.fit5.post.proc efsud gam.fit5 efsudr gam.fit4 fmud.test corb fetad.test dDeta

Documented in dDeta gam.fit5.post.proc

Try the mgcv package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mgcv
Mixed GAM Computation Vehicle with Automatic Smoothness Estimation

R/gam.fit4.r
In mgcv: Mixed GAM Computation Vehicle with Automatic Smoothness Estimation