estimate(e2,f=function(p) predict(e2,p=p,newdata=newd))$coefmat) head(p)  The fitted function can be obtained with the following code: plot(I(eta2-z) ~ eta1, data=d, col=Col("black",0.5), pch=16, xlab=expression(eta[1]), ylab=expression(eta[2]), xlim=c(-4,4)) lines(Estimate ~ eta1, data=as.data.frame(p), col="darkblue", lwd=5) confband(p[,1], lower=p[,4], upper=p[,5], polygon=TRUE, border=NA, col=Col("darkblue",0.2))  # Cross-validation A more formal comparison of the different models can be obtained by cross-validation. Here we specify linear, quadratic and cubic spline models with 4 and 9 degrees of freedom. m2a <- nonlinear(m2, type="linear", eta2~eta1) m2b <- nonlinear(m2, type="quadratic", eta2~eta1) kn1 <- seq(-3,3,length.out=5) kn2 <- seq(-3,3,length.out=8) m2c <- nonlinear(m2, type="spline", knots=kn1, eta2~eta1) m2d <- nonlinear(m2, type="spline", knots=kn2, eta2~eta1)  To assess the model fit average RMSE is estimated with 5-fold cross-validation repeated two times ## Scale models in stage 2 to allow for a fair RMSE comparison d0 <- d for (i in endogenous(m2)) d0[,i] <- scale(d0[,i],center=TRUE,scale=TRUE) ## Repeated 5-fold cross-validation: ff <- lapply(list(linear=m2a,quadratic=m2b,spline4=m2c,spline6=m2d), function(m) function(data,...) twostage(m1,m,data=data,stderr=FALSE,control=list(start=coef(e0),contrain=TRUE))) fit.cv <- cv(ff,data=d,K=5,rep=2,mc.cores=parallel::detectCores(),seed=1)  ## To save time building the vignettes on CRAN, we cache time consuming computations if (fullVignette) { fit.cv$fit <- NULL
saveRDS(fit.cv, "nonlinear_fitcv.rds")
} else {
}

summary(fit.cv)


Here the RMSE is in favour of the splines model with 4 degrees of freedom:

fit <- lapply(list(m2a,m2b,m2c,m2d),
function(x) {
e <- twostage(m1,x,data=d)
pr <- cbind(eta1=newd$eta1,predict(e,newdata=newd$eta1,x=TRUE))
return(list(estimate=e,predict=as.data.frame(pr)))
})

plot(I(eta2-z) ~ eta1, data=d, col=Col("black",0.5), pch=16,
xlab=expression(eta[1]), ylab=expression(eta[2]), xlim=c(-4,4))
col <- c("orange","darkred","darkgreen","darkblue")
lty <- c(3,4,1,5)
for (i in seq_along(fit)) {
with(fit[[i]]$pr, lines(eta2 ~ eta1, col=col[i], lwd=4, lty=lty[i])) } legend("bottomright", c("linear","quadratic","spline(df=4)","spline(df=6)"), col=col, lty=lty, lwd=3)  For convenience, the function twostageCV can be used to do the cross-validation (also for choosing the mixture distribution via the nmix argument, see the section below). For example, selmod <- twostageCV(m1, m2, data=d, df=2:4, nmix=1:2, nfolds=2, rep=1, mc.cores=parallel::detectCores())  ## To save time building the vignettes on CRAN, we cache time consuming computations if (fullVignette) { saveRDS(summary(selmod), "nonlinear_selmod.rds") } else { selmod <- readRDS("nonlinear_selmod.rds") }  applies cross-validation (here just 2 folds for simplicity) to select the best splines with degrees of freedom varying from from 1-3 (the linear model is automatically included) selmod  # Specification of general functional forms Next, we show how to specify a general functional relation of multiple different latent or exogenous variables. This is achieved via the predict.fun argument. To illustrate this we include interactions between the latent variable (\eta_{1}) and a dichotomized version of the covariate (z) d$g <- (d$z<0)*1 ## Group variable mm1 <- regression(m1, ~g) # Add grouping variable as exogenous variable (effect specified via 'predict.fun') mm2 <- regression(m2, eta2~ u1+u2+u1:g+u2:g+z) pred <- function(mu,var,data,...) { cbind("u1"=mu[,1],"u2"=mu[,1]^2+var[1], "u1:g"=mu[,1]*data[,"g"],"u2:g"=(mu[,1]^2+var[1])*data[,"g"]) } ee1 <- twostage(mm1, model2=mm2, data=d, predict.fun=pred) estimate(ee1,keep="eta2~u",regex=TRUE)  A formal test show no statistically significant effect of this interaction summary(estimate(ee1,keep="(:g)", regex=TRUE))  # Mixture models Lastly, we demonstrate how the distributional assumptions of stage 1 model can be relaxed by letting the conditional distribution of the latent variable given covariates follow a Gaussian mixture distribution. The following code explictly defines the parameter constraints of the model by setting the intercept of the first indicator variable, (x_{1}), to zero and the factor loading parameter of the same variable to one. m1 <- baptize(m1) ## Label all parameters intercept(m1, ~x1+eta1) <- list(0,NA) ## Set intercept of x1 to zero. Remove the label of eta1 regression(m1,x1~eta1) <- 1 ## Factor loading fixed to 1  The mixture model may then be estimated using the mixture method (note, this requires the mets package to be installed), where the Parameter names shared across the different mixture components given in the list will be constrained to be identical in the mixture model. Thus, only the intercept of (\eta_{1}) is allowed to vary between the mixtures. set.seed(1) em0 <- mixture(m1, k=2, data=d)  To decrease the risk of using a local maximizer of the likelihood we can rerun the estimation with different random starting values em0 <- NULL ll <- c() for (i in 1:5) { set.seed(i) em <- mixture(m1, k=2, data=d, control=list(trace=0)) ll <- c(ll,logLik(em)) if (is.null(em0) || logLik(em0)<tail(ll,1)) em0 <- em }  ## To save time building the vignettes on CRAN, we cache time consuming computations if (fullVignette) { saveRDS(em0, "nonlinear_em0.rds") } else { em0 <- readRDS("nonlinear_em0.rds") }  summary(em0)  Measured by AIC there is a slight improvement in the model fit using the mixture model e0 <- estimate(m1,data=d) AIC(e0,em0)  The spline model may then be estimated as before with the two-stage method em2 <- twostage(em0,m2,data=d) em2  In this example the results are very similar to the Gaussian model: plot(I(eta2-z) ~ eta1, data=d, col=Col("black",0.5), pch=16, xlab=expression(eta[1]), ylab=expression(eta[2])) lines(Estimate ~ eta1, data=as.data.frame(p), col="darkblue", lwd=5) confband(p[,1], lower=p[,4], upper=p[,5], polygon=TRUE, border=NA, col=Col("darkblue",0.2)) pm <- cbind(eta1=newd$eta1,
estimate(em2, f=function(p) predict(e2,p=p,newdata=newd))\$coefmat)
lines(Estimate ~ eta1, data=as.data.frame(pm), col="darkred", lwd=5)
confband(pm[,1], lower=pm[,4], upper=pm[,5], polygon=TRUE,
border=NA, col=Col("darkred",0.2))
legend("bottomright", c("Gaussian","Mixture"),
col=c("darkblue","darkred"), lwd=2, bty="n")


# Bibliography

[holstjoergensen_lava] Holst & Budtz-J\orgensen, Linear latent variable models: the lava-package, Computational Statistics, 28(4), 1385-1452 (2013). doi.

[lava_nlin] Klaus Kähler Holst & Esben Budtz-Jørgensen, A two-stage estimation procedure for non-linear structural equation models, Biostatistics, (in press), (2020). doi.

## Try the lava package in your browser

Any scripts or data that you put into this service are public.

lava documentation built on Sept. 5, 2021, 5:43 p.m.