# R/fregre.gsam.vs.R In fda.usc: Functional Data Analysis and Utilities for Statistical Computing

#### Documented in fregre.gsam.vs

#' @title Variable Selection using Functional Additive Models
#'
#' @description  Computes functional GAM model between functional covariates
#'  \eqn{(X^1(t_1),\cdots,X^{q}(t_q))}{(X(t_1),...,X(t_q))} and non functional covariates
#'   \eqn{(Z^1,...,Z^p)}{(Z1,...,Zp)} with a scalar response \eqn{Y}.
#' @details This function is an extension of the functional generalized spectral additive
#' regression models: \code{\link{fregre.gsam}} where the \eqn{E[Y|X,Z]} is related to the
#' linear prediction \eqn{\eta} via a link function \eqn{g(\cdot)}{g(.)} with integrated
#' smoothness estimation by the smooth functions \eqn{f(\cdot)}{f(.)}.
#' \deqn{E[Y|X,Z])=\eta=g^{-1}(\alpha+\sum_{i=1}^{p}f_{i}(Z^{i})+\sum_{k=1}^{q}\sum_{j=1}^{k_q}{f_{j}^{k}(\xi_j^k)})}{E[Y|X,Z]=\eta=g^{-1}(\alpha+\sum_i  f_i(Z_{i})+\sum_k^q\sum_{j=1}^{k_q}{f_j^k(\xi_j^k)})}
#' where \eqn{\xi_j^k}{\xi_j^k} is the coefficient of the basis  function expansion of
#' \eqn{X^k}, (in PCA analysis \eqn{\xi_j^k}{\xi_j^k} is the score of the \eqn{j}-functional
#' PC of \eqn{X^k}.
#'
#' The smooth functions \eqn{f(\cdot)}{f(.)} can be added to the right hand side of the formula
#' to specify that the linear predictor depends on smooth functions of predictors using smooth
#' these as \eqn{Z\beta} and \eqn{\big<X(t),\beta\big>}{< X(t),\beta(t) >} in \code{\link{fregre.glm}}).

#' @param data List that containing the variables in the model.
#' "df" element is a data.frame containing the response and scalar covariates
#' (numeric and factors variables are allowed). Functional covariates of class
#'   \code{fdata} or \code{fd} are included as named components in the \code{data} list.
#' @param y Caracter string with the name of the scalar response variable.
# @param x Caracter string vector with the name of the scalar and functional
# potential covariates. If missing, all scalar and functional covariates are included.
#' @param include vector with the name of variables to use. By default \code{"all"}, all variables are used.
#' @param exclude vector with the name of variables to not use. By default  \code{"none"}, no variable is deleted.
#' @param family a description of the error distribution and link function to
#' be used in the model. This can be a character string naming a family
#' function, a family function or the result of a call to a family function.
#' (See \code{\link{family}} for details of family functions.)
#' @param weights weights
#' @param alpha Alpha value for testing the independence among covariate X and residual
#'  e in previous steps. By default is \code{0.05}.
#' @param basis.x Basis parameter options
#'  \itemize{
#'  \item{\code{list}} (recomended)  List of basis for functional covariates,
#'  see same argument in \code{\link{fregre.glm}}. By default,
#'  the function uses a basis of 3 PC to represent each functional covariate.
#'  \item{\code{vector}} (by default) Vector with two parameters:
#'  \enumerate{
#'   \item Type of basis. By default \code{basis.x[1]="pc"}, principal
#'    component basis is used  for each functional covariate included in the model.
#'    Other options \code{"pls"} and \code{"bspline"}.
#'   \item Maximum number of  basis elements \code{numbasis}  to be used.
#'  By default, \code{basis.x[2]=3}.
#'  }
#   \item Selection of number of basis components/elements:
#   if \code{basis.x[3]=TRUE}  (by default), the algorithm selects the
#   best components from the first \code{basis.x[2]}. If \code{basis.x[3]=FALSE}, the
#    model is estimated using \code{ncomp} components for each functional covariate.
#'    }
#' @param numbasis.opt Logical, if \code{FALSE} by default, for each functional
#'  covariate included in the model, the function uses all basis elements.
#'  Otherwise, the function selects the significant coefficients.
#'
#' @param kbs The dimension of the basis used to represent the smooth term. The default
#' depends on the number of variables that the smooth is a function of.
#' @param dcor.min Threshold for a variable to be entered into the model. X is discarded
#' if the distance correlation \eqn{R(X,e)< dcor.min} (e is the residual of previous steps).
#' @param par.model Model parameters.
#' @param xydist List with the inner distance matrices of each variable (all potential
#' covariates and the response).
#' @param trace Interactive Tracing and Debugging of Call.
# @param CV TRUE, Cross-validation (CV) is done.
# @param smooth If TRUE, a smooth estimate is made for all covariates included in the model
#  (less for factors). The model is adjusted with the estimated variable linearly or smoothly.
#  If the models are equivalent, the model is adjusted with the linearly estimated variable.
#'
#' @return Return an object corresponding to the estimated additive mdoel using
#' the selected variables (ame output as the\code{\link{fregre.gsam}} function) and the following elements:
#' \itemize{
#' \item{\code{gof}}, the goodness of fit for each step of VS algorithm.
#' \item{\code{i.predictor}}, \code{vector} with 1 if the variable is selected, 0 otherwise.
#' \item{\code{ipredictor}}, \code{vector} with the name of selected variables (in order of selection)
#' \item{\code{dcor}}, the value of distance correlation for each potential covariate and the residual of the model in each step.
#' }
#'
#' @note If the formula only contains a non functional explanatory variables (multivariate covariates),
#'  the function compute a standard  \code{\link{gam}} procedure.
#'
#' @author Manuel Feb-Bande, Manuel Oviedo de la Fuente
#' \email{manuel.oviedo@@udc.es}
#'
#'
#' @references Febrero-Bande, M., Gonz\'alez-Manteiga, W. and Oviedo de la
#' Fuente, M. Variable selection in functional additive regression models,
#' (2018).  Computational Statistics, 1-19. DOI: \doi{10.1007/s00180-018-0844-5}
#'
#' @keywords regression
#' @examples
#' \dontrun{
#' data(tecator)
#' x=tecator$absorp.fdata #' x1 <- fdata.deriv(x) #' x2 <- fdata.deriv(x,nderiv=2) #' y=tecator$y$Fat #' xcat0 <- cut(rnorm(length(y)),4) #' xcat1 <- cut(tecator$y$Protein,4) #' xcat2 <- cut(tecator$y$Water,4) #' ind <- 1:165 #' dat <- data.frame("Fat"=y, x1$data, xcat1, xcat2)
#' ldat <- ldata("df"=dat[ind,],"x"=x[ind,],"x1"=x1[ind,],"x2"=x2[ind,])
#' # 3 functionals (x,x1,x2), 3 factors (xcat0, xcat1, xcat2)
#' # and 100 scalars (impact poitns of x1)
#'
#' # Time consuming
#' res.gam0 <- fregre.gsam.vs(data=ldat,y="Fat"
#'             ,exclude="x2",numbasis.opt=T) # All the covariates
#' summary(res.gam0)
#' res.gam0$ipredictors #' #' res.gam1 <- fregre.gsam.vs(data=ldat,y="Fat") # All the covariates #' summary(res.gam1) #' res.gam1$ipredictors
#'
#' covar <- c("xcat0","xcat1","xcat2","x","x1","x2")
#' res.gam2 <- fregre.gsam.vs(data=ldat, y="Fat", include=covar)
#' summary(res.gam2)
#' res.gam2$ipredictors #' res.gam2$i.predictor
#'
#' res.gam3 <- fregre.gsam.vs(data=ldat,y="Fat",
#'             basis.x=c("type.basis"="pc","numbasis"=10))
#' summary(res.gam3)
#' res.gam3$ipredictors #' #' res.gam4 <- fregre.gsam.vs(data=ldat,y="Fat",include=c("x","x1","x2"), #' basis.x=c("type.basis"="pc","numbasis"=5),numbasis.opt=T) #' summary(res.gam4) #' res.gam4$ipredictors

#' lpc <- list("x"=create.pc.basis(ldat$x,1:4) #' ,"x1"=create.pc.basis(ldat$x1,1:3)
#'            ,"x2"=create.pc.basis(ldat$x2,1:12)) #' res.gam5 <- fregre.gsam.vs(data=ldat,y="Fat",basis.x=lpc) #' summary(res.gam5) #' res.gam6 <- fregre.gsam.vs(data=ldat,y="Fat",basis.x=lpc,numbasis.opt=T) #' summary(res.gam6) #' bsp <- create.fourier.basis(ldat$x$rangeval,7) #' lbsp <- list("x"=bsp,"x1"=bsp,"x2"=bsp) #' res.gam7 <- fregre.gsam.vs(data=ldat,y="Fat",basis.x=lbsp,kbs=4) #' summary(res.gam7) #' # Prediction like fregre.gsam() #' newldat <- ldata("df"=dat[-ind,],"x"=x[-ind,],"x1"=x1[-ind,], #' "x2"=x2[-ind,]) #' pred.gam1 <- predict(res.gam1,newldat) #' pred.gam2 <- predict(res.gam2,newldat) #' pred.gam3 <- predict(res.gam3,newldat) #' pred.gam4 <- predict(res.gam4,newldat) #' pred.gam5 <- predict(res.gam5,newldat) #' pred.gam6 <- predict(res.gam6,newldat) #' pred.gam7 <- predict(res.gam7,newldat) #' plot(dat[-ind,"Fat"],pred.gam1) #' points(dat[-ind,"Fat"],pred.gam2,col=2) #' points(dat[-ind,"Fat"],pred.gam3,col=3) #' points(dat[-ind,"Fat"],pred.gam4,col=4) #' points(dat[-ind,"Fat"],pred.gam5,col=5) #' points(dat[-ind,"Fat"],pred.gam6,col=6) #' points(dat[-ind,"Fat"],pred.gam7,col=7) #' pred2meas(newldat$df$Fat,pred.gam1) #' pred2meas(newldat$df$Fat,pred.gam2) #' pred2meas(newldat$df$Fat,pred.gam3) #' pred2meas(newldat$df$Fat,pred.gam4) #' pred2meas(newldat$df$Fat,pred.gam5) #' pred2meas(newldat$df$Fat,pred.gam6) #' pred2meas(newldat$df$Fat,pred.gam7) #' } #' @export fregre.gsam.vs <- function(data = list(), y, include = "all", exclude = "none" ,family = gaussian(), weights = NULL , basis.x = NULL , numbasis.opt = FALSE , kbs , dcor.min = 0.1, alpha = 0.05 , par.model, xydist, trace = FALSE ){ #0- print(0) verbose = trace #criterio = "sp" if (missing(y)) stop("The name of the response must be specified in the 'y' argument") # if (missing(alpha)) alpha <- .05 if (missing(par.model)) par.model <- list() #se puede cambiar la familia n.edf.correction <- FALSE namdata <- names(data) idf <- which("df"==namdata ) ydat <- data$df[,y]
namfunc <- names(data[-idf])
namnfunc <- setdiff(names(data$df),y) namnfunc <- setdiff(namnfunc,exclude) namfunc <- setdiff(namfunc,exclude) if (include[1] == "all"){ x <- c(namnfunc,namfunc) ifunc <- namfunc infunc <- namnfunc } else { x = include ifunc <- intersect(x,namfunc) infunc <- intersect(c(y,x),namnfunc) # xdatos <- as.list(data$df[,infunc,drop=F])
#    xdatos <- c(xdatos,data[ifunc])
}

# var.name <- names(x)
# a <- sapply(x, class, USE.NAMES = T)
# x <- x[, a != "character", drop = F]
# var.name <- names(x)
# if (include[1] != "all") {
#   var.name <- intersect(include, var.name)
# }
# if (exclude[1] != "none") {
#   var.name <- setdiff(var.name, exclude)
# }
xdatos <- as.list(data$df[,y,drop=F]) xdatos <- c(xdatos,as.list(data$df[,infunc,drop=F]),data[ifunc])
ldata0 <- xdatos
resp <- y
xentran <- NULL
tipoentran <- NULL

#1- caclulo= distancias de cada objeto consigo mismo
#  print("1 calculando distancias")
#  ldist0 <- dist.list(ldata0)
xynam <- names(ldata0)
nvar <- length(x)
if (missing(xydist))    {
xydist <- ldist0 <- dist.list(ldata0)
}    else {
ldist0 <- xydist[xynam]
}
# print("2 calculando distancias")
parar <- FALSE
it <- 1
basisx <- NULL
npredictors <- length(ldata0)
ipredictors <- numeric(npredictors-1)
names(ipredictors) <- setdiff(names(ldata0),resp)

nvar <- npredictors - 1
if (missing(kbs))
kbs <- - 1 #rep(-1,nvar)
#names(kbs) <- names(ipredictors)
basis.list <- FALSE
if (is.null(basis.x)) {
# print("base nula")
#ncomp <- rep(4,length(ifunc))
#names(ncomp) <- ifunc
ncomp <- 4
type.basis = "pc"
}  else {
# print("base usuario")
if (is.list(basis.x)){ # base proporcionada por el usuario
basis.list = TRUE
basisx <- basis.x
type.basis = basis.x[[1]]$type }else{ # print("base vector") if (length(basis.x)!=2) stop("basis.x must be a vector with length two") type.basis = basis.x[1] ncomp <- as.numeric(basis.x[2]) } } tbasis <- type.basis if (type.basis %in% c("fourier","bspline")) tbasis="basis" dcor=matrix(0,nrow= nvar,ncol= nvar) colnames(dcor)=c(names(ipredictors)) rownames(dcor)=1:nvar #2- caclulo correlacion de cada la distancia de la respuesta vs distancia del resto de objetos # print("2 calculando correlaciones") dist_resp <- dcor.y(ldist0,resp) dcor[it,names(dist_resp)] <- dist_resp*(dist_resp > dcor.min) n.edf <- length(ydat) fpredictors.nl <- "" form.nl <- paste(resp,"~",sep="") basis2 <- list() ycen <- data$df[,y]-mean(data$df[,y]) gof <- NULL anyfdata <- FALSE res.prev <- gam(as.formula(paste0(form.nl,1)),data=data$df,family=family)
while (!parar){
# print("3 Seleccion variable-Regresion")
# print("bucle")
esfactor <- FALSE #SE UTILIZA PARA NO PONER s(factor)
nam <-  names(dist_resp)
ind.xentra <- which.max(dist_resp)
xentra <- nam[ind.xentra]
#dd <- dcor.ttest(ldist0[[resp]],ldist0[[xentra]],distance=TRUE)
dd <- dcor.test(ldist0[[resp]],ldist0[[xentra]],n=n.edf)
#if (trace){      print(dd);      print(xentra)    }
if (verbose) {
print(it);      print(n.edf);      print(nam)
cat("Enter the variable ",xentra);      print(dd);      print(names(dd))
}
#if (is.null(basisx)) {     basisx <- list()    }
if (dd$p.value > alpha ) { parar=TRUE if (trace) print("The algorithm ends because no variable is significant") } else{ if (dd$estimate < dcor.min ) {
parar=TRUE
if (trace) print("The algorithm ends because dcor < dcor.min")
}
else {
rownames(dcor)[it] <- xentra
if (trace)      cat("Covariate: ")
if (trace)       print(xentra)
#if (is.fdata(ldata0[[xentra]]) & !basis.list) {
if (is.fdata(ldata0[[xentra]])) {
# print("clase de base")
par.basis.ipred <- list()
anyfdata <- TRUE
par.basis.ipred$fdataobj <- ldata0[[xentra]] #if (numbasis.opt | basis.list ){ if ( basis.list ){ type.basis = basis.x[[xentra]]$type
if (type.basis=="bspline" | type.basis=="fourier") ncomp <- basis.x[[xentra]]$nbasis if (type.basis=="pc" | type.basis=="pls") ncomp <- length(basis.x[[xentra]]$basis)
basisx[[xentra]] <- basis.x[[xentra]]
}

if (xentra %in% names(type.basis)){
itype <- which(xentra== names(type.basis))
type.basis.ipred <- type.basis[itype]
}    else {
type.basis.ipred <- type.basis[1]
}
tbasis <- type.basis.ipred
if (type.basis.ipred=="fourier") tbasis <- "basis"
if (type.basis.ipred=="bspline") tbasis <- "basis"

nam <- "fregre.gsam.cv"
par.basis.ipred$y <- resp#entra la etiqueta y no toda la variable como en el basis.cv o pc.cv par.basis.ipred$x <- xentra
par.basis.ipred$data <- data if (numbasis.opt) { res <- fregre.gsam.cv(data,resp,xentra, alpha = alpha ,family=family #,type.basis=tbasis, kbs = kbs ,type.basis=type.basis, kbs = kbs ,numbasis = ncomp, numbasis.opt=numbasis.opt) if (tbasis!="basis"){ res$basis.x[[xentra]]$basis <- res$basis.x[[xentra]]$basis[res$numbasis.opt,]
}
res$basis.x[[xentra]]$l <- res$numbasis.opt basisx[[xentra]] <- res$basis.x[[xentra]]
#basisb[[xentra]] <- res$basis.x[[xentra]] if (trace) print("results of internal function fregre.gsam.cv") if (trace) print(summary(res)) # parar=TRUE } else{ # print("no numbasis.opt") if (!basis.list){ # print("NO entra gsam.CV") switch(tbasis, "pc"={ best.pc <- 1:ncomp#[xentra] basis1 <- create.pc.basis(ldata0[[xentra]],best.pc) }, "pls"={ best.pc <- 1:ncomp#[xentra] basis1 <- create.pls.basis(ldata0[[xentra]],ldata0[[resp]],best.pc) },"basis"={ # best.pc <- 1:kmax #basis1 <- create.bspline.basis(ldata0[[xentra]]$rangeval,nbasis=ncomp)
basis1 <- create.fdata.basis(ldata0[[xentra]],l=1:ncomp,type.basis=type.basis,
rangeval=ldata0[[xentra]]$rangeval) args(create.fdata.basis) # basis2 <- basis1 }) basisx[[xentra]] <- basis1 } else basisx[[xentra]] <- basis.x[[xentra]]} } if (!parar) { # print("no parar"); print(xentra) xentran <- c(xentran,xentra) ind.xentra2 <- which(xentra==names(ldist0)) ldist0 <- ldist0[-ind.xentra2] #ipredictors[xentra] <- ipredictors[xentra]+1 #4- contruccion del modelo para esta variable #consido un catalogo de 4 posibilidades (lineal/nolineal, funcional/scalar) if (is.factor(ldata0[[xentra]])) esfactor=TRUE fx=FALSE if (esfactor) { fpredictors.nl <- xentra } else { fpredictors.nl <- paste("s(",xentra,",k=",kbs,",","fx=",fx,")", sep="",collapse="+") } nam.model <- "fregre.gsam" form.nl <- (paste(form.nl,"+",fpredictors.nl,sep="")) par.model$formula <- as.formula(form.nl)
par.model$data <- data par.model[["basis.x"]] <- basisx if (tbasis == "basis"){ par.model[["basis.b"]] <- basisx } par.model[["family"]] <- family res.nl <- do.call(nam.model,par.model) mejora <- res.prev$aic > res.nl$aic # print(res.nl); print(it); print(res.prev); print(mejora); print(res.prev$gcv.ubre);          print(res.nl$gcv.ubre) if (it>1 & mejora){ aov <- anova(res.nl,res.prev,test="F") mejora <- aov[["Pr(>F)"]][2] < alpha if (length(mejora)==0) mejora <- TRUE if (is.na(mejora)) mejora <- TRUE } # cat("mejora",res.prev$gcv.ubre, res.nl$gcv.ubre,res.prev$gcv.ubre > res.nl$gcv.ubre,"\n") if (mejora){ suma <- summary(res.nl) dd <- res.nl$dims
df <- dd[["p"]]
edf <- dd[["N"]] - dd[["p"]]
#         sr2 <- sum(res.nl$residuals^2)/edf r2 <- 1 - sum(res.nl$residuals^2)/sum(ycen^2)
if (n.edf.correction) n.edf <-  edf
ldata0[[resp]] <- res.nl$residuals ldist0[[resp]] <- as.matrix(dist(ldata0[[resp]]), diag =TRUE, upper = TRUE, p = 2) #2- caclulo correlacion de cada la distancia de la respuesta vs distancia del resto de objetos if (it==(npredictors-1)) { parar=TRUE #gof <- rbindgof,c(suma$logLik,suma$BIC,suma$AIC,edf,r2))
gof <- rbind(gof,c(AIC(res.nl),deviance(res.nl),res.nl$df.residual,suma$r.sq,suma$dev.expl,res.nl$gcv.ubre))
}        else{
it <- it+1
# print("3 calculando correlaciones")
dist_resp <- dcor.y(ldist0,resp)
# LO PONE A 0
dcor[it,names(dist_resp)]=dist_resp*(dist_resp > dcor.min)
#  print(dcor[it,names(dist_resp)])
}
#     print(summary(res.nl))
if (!parar){
#            gof <- rbind(gof,drop(c(suma$logLik,suma$BIC,suma$AIC,edf,r2))) gof <- rbind(gof,c(AIC(res.nl),deviance(res.nl), res.nl$df.residual,suma$r.sq, suma$dev.expl,res.nl$gcv.ubre)) } else { cat("The variable ",xentra,"does not improve the fit of the model\n") } res.prev <- res.nl } else{ #print("NOOOOOOO MEJORAAAAAAAA") if (it==nvar) parar=TRUE res.nl <- res.prev xentran <- xentran[-length(xentran)] dist_resp[xentra]<-0 if (all(dist_resp==0)) parar=TRUE } } } } } gof <- data.frame(xentran,(gof)) if (is.null(xentran)) { warning("No variable selected, a null model is estimated") res.nl <- fregre.gsam(as.formula(paste(resp,"~1",sep="")),data=data,family=family) suma <- summary(res.nl) gof <- data.frame(rbind(c(1,AIC(res.nl),deviance(res.nl), res.nl$df.residual,suma$r.sq, suma$dev.expl,res.nl$gcv.ubre))) } if (length(gof)==1) gof <- numeric(7) names(gof) <- c("xentra","AIC","deviance","df.residual","r.sq","dev.expl","GCV.ubre") res.nl$gof <- gof
res.nl$i.predictor = ipredictors res.nl$i.predictor[xentran] <- 1
res.nl$ipredictors = xentran res.nl$xydist = xydist
res.nl\$dcor = dcor[1:(it-1),]
return(res.nl)
}


## Try the fda.usc package in your browser

Any scripts or data that you put into this service are public.

fda.usc documentation built on Oct. 17, 2022, 9:06 a.m.