R/fregre.gsam.vs.R
In fda.usc: Functional Data Analysis and Utilities for Statistical Computing

Documented in fregre.gsam.vs

#' @title Variable Selection using Functional Additive Models      
#' 
#' @description  Computes functional GAM model between functional covariates
#'  \eqn{(X^1(t_1),\cdots,X^{q}(t_q))}{(X(t_1),...,X(t_q))} and non functional covariates
#'   \eqn{(Z^1,...,Z^p)}{(Z1,...,Zp)} with a scalar response \eqn{Y}.
#' @details This function is an extension of the functional generalized spectral additive 
#' regression models: \code{\link{fregre.gsam}} where the \eqn{E[Y|X,Z]} is related to the 
#' linear prediction \eqn{\eta} via a link function \eqn{g(\cdot)}{g(.)} with integrated 
#' smoothness estimation by the smooth functions \eqn{f(\cdot)}{f(.)}. 
#' \deqn{E[Y|X,Z] = \eta = g^{-1}(\alpha + \sum_{i=1}^{p} f_{i}(Z^{i}) + \sum_{k=1}^{q} \sum_{j=1}^{k_q} f_{j}^{k}(\xi_j^k))}
#' where \eqn{\xi_j^k}{\xi_j^k} is the coefficient of the basis function expansion of 
#' \eqn{X^k}, (in PCA analysis, \eqn{\xi_j^k}{\xi_j^k} is the score of the \eqn{j}-functional
#' PC of \eqn{X^k}).
#'  
#' The smooth functions \eqn{f(\cdot)}{f(.)} can be added to the right-hand side of the formula
#' to specify that the linear predictor depends on smooth functions of predictors using smooth 
#' terms \link[mgcv]{s} and \link[mgcv]{te} as in \link[mgcv]{gam} (or linear functionals of 
#' these as \eqn{Z\beta} and \eqn{\langle X(t), \beta(t) \rangle} in \code{\link{fregre.glm}}).
#' @param data List that containing the variables in the model. 
#' "df" element is a data.frame containing the response and scalar covariates 
#' (numeric and factors variables are allowed). Functional covariates of class
#'   \code{fdata} or \code{fd} are included as named components in the \code{data} list. 
#' @param y Caracter string with the name of the scalar response variable.
# @param x Caracter string vector with the name of the scalar and functional 
# potential covariates. If missing, all scalar and functional covariates are included.                     
#' @param include vector with the name of variables to use. By default \code{"all"}, all variables are used.
#' @param exclude vector with the name of variables to not use. By default  \code{"none"}, no variable is deleted.
#' @param family a description of the error distribution and link function to
#' be used in the model. This can be a character string naming a family
#' function, a family function or the result of a call to a family function.
#' (See \code{\link{family}} for details of family functions.)
#' @param weights weights 
#' @param alpha Alpha value for testing the independence among covariate X and residual
#'  e in previous steps. By default is \code{0.05}.
#' @param basis.x Basis parameter options
#'  \itemize{
#'  \item{\code{list}} (recomended)  List of basis for functional covariates,
#'  see same argument in \code{\link{fregre.glm}}. By default, 
#'  the function uses a basis of 3 PC to represent each functional covariate. 
#'  \item{\code{vector}} (by default) Vector with two parameters:
#'  \enumerate{
#'   \item Type of basis. By default \code{basis.x[1]="pc"}, principal
#'    component basis is used  for each functional covariate included in the model.
#'    Other options \code{"pls"} and \code{"bspline"}.  
#'   \item Maximum number of  basis elements \code{numbasis}  to be used.
#'  By default, \code{basis.x[2]=3}. 
#'  }
#   \item Selection of number of basis components/elements: 
#   if \code{basis.x[3]=TRUE}  (by default), the algorithm selects the
#   best components from the first \code{basis.x[2]}. If \code{basis.x[3]=FALSE}, the
#    model is estimated using \code{ncomp} components for each functional covariate.
#'    }
#' @param numbasis.opt Logical, if \code{FALSE} by default, for each functional 
#'  covariate included in the model, the function uses all basis elements. 
#'  Otherwise, the function selects the significant coefficients.
#'  
#' @param kbs The dimension of the basis used to represent the smooth term. The default 
#' depends on the number of variables that the smooth is a function of.
#' @param dcor.min Threshold for a variable to be entered into the model. X is discarded 
#' if the distance correlation \eqn{R(X,e)< dcor.min} (e is the residual of previous steps).
#' @param par.model Model parameters.
#' @param xydist List with the inner distance matrices of each variable (all potential 
#' covariates and the response).
#' @param trace Interactive Tracing and Debugging of Call.
# @param CV TRUE, Cross-validation (CV) is done.
# @param smooth If TRUE, a smooth estimate is made for all covariates included in the model
#  (less for factors). The model is adjusted with the estimated variable linearly or smoothly. 
#  If the models are equivalent, the model is adjusted with the linearly estimated variable.
#' 
#' @return Return an object corresponding to the estimated additive mdoel using 
#' the selected variables (ame output as the\code{\link{fregre.gsam}} function) and the following elements:
#' \itemize{
#' \item \code{gof}: the goodness of fit for each step of VS algorithm.
#' \item \code{i.predictor}: \code{vector} with 1 if the variable is selected, 0 otherwise.
#' \item \code{ipredictor}: \code{vector} with the name of selected variables (in order of selection)
#' \item \code{dcor}: the value of distance correlation for each potential covariate and the residual of the model in each step.
#' }
#' 
#' @note If the formula only contains a non functional explanatory variables (multivariate covariates),
#'  the function compute a standard  \link[mgcv]{gam} procedure.
#' 
#' @author Manuel Feb-Bande, Manuel Oviedo de la Fuente
#' \email{manuel.oviedo@@udc.es}
#' 
#' @seealso See Also as:  \code{\link{predict.fregre.gsam}} and \link[mgcv]{summary.gam}.
#' Alternative methods: \code{\link{fregre.glm}}, \code{\link{fregre.gsam}}
#'  and \code{\link{fregre.gkam}}.
#'  
#' @references Febrero-Bande, M., Gonz\'alez-Manteiga, W. and Oviedo de la
#' Fuente, M. Variable selection in functional additive regression models,
#' (2018).  Computational Statistics, 1-19. DOI: \doi{10.1007/s00180-018-0844-5}
#' 
#' @keywords regression
#' @examples 
#' \dontrun{ 
#' data(tecator)
#' x=tecator$absorp.fdata
#' x1 <- fdata.deriv(x)
#' x2 <- fdata.deriv(x,nderiv=2)
#' y=tecator$y$Fat
#' xcat0 <- cut(rnorm(length(y)),4) 
#' xcat1 <- cut(tecator$y$Protein,4)
#' xcat2 <- cut(tecator$y$Water,4)
#' ind <- 1:165
#' dat <- data.frame("Fat"=y, x1$data, xcat1, xcat2)
#' ldat <- ldata("df"=dat[ind,],"x"=x[ind,],"x1"=x1[ind,],"x2"=x2[ind,])
#' # 3 functionals (x,x1,x2), 3 factors (xcat0, xcat1, xcat2)
#' # and 100 scalars (impact poitns of x1) 
#' 
#' # Time consuming
#' res.gam0 <- fregre.gsam.vs(data=ldat,y="Fat"
#'             ,exclude="x2",numbasis.opt=T) # All the covariates
#' summary(res.gam0)
#' res.gam0$ipredictors
#' 
#' res.gam1 <- fregre.gsam.vs(data=ldat,y="Fat") # All the covariates
#' summary(res.gam1)
#' res.gam1$ipredictors
#' 
#' covar <- c("xcat0","xcat1","xcat2","x","x1","x2")
#' res.gam2 <- fregre.gsam.vs(data=ldat, y="Fat", include=covar)
#' summary(res.gam2)
#' res.gam2$ipredictors 
#' res.gam2$i.predictor
#' 
#' res.gam3 <- fregre.gsam.vs(data=ldat,y="Fat",
#'             basis.x=c("type.basis"="pc","numbasis"=10))
#' summary(res.gam3)
#' res.gam3$ipredictors
#' 
#' res.gam4 <- fregre.gsam.vs(data=ldat,y="Fat",include=c("x","x1","x2"),
#' basis.x=c("type.basis"="pc","numbasis"=5),numbasis.opt=T)
#' summary(res.gam4)
#' res.gam4$ipredictors

#' lpc <- list("x"=create.pc.basis(ldat$x,1:4)
#'            ,"x1"=create.pc.basis(ldat$x1,1:3)
#'            ,"x2"=create.pc.basis(ldat$x2,1:12))
#' res.gam5 <- fregre.gsam.vs(data=ldat,y="Fat",basis.x=lpc)
#' summary(res.gam5)
#' res.gam6 <- fregre.gsam.vs(data=ldat,y="Fat",basis.x=lpc,numbasis.opt=T)
#' summary(res.gam6)

#' bsp <- create.fourier.basis(ldat$x$rangeval,7)
#' lbsp <- list("x"=bsp,"x1"=bsp,"x2"=bsp)
#' res.gam7 <- fregre.gsam.vs(data=ldat,y="Fat",basis.x=lbsp,kbs=4)
#' summary(res.gam7)

#' # Prediction like fregre.gsam() 
#' newldat <- ldata("df"=dat[-ind,],"x"=x[-ind,],"x1"=x1[-ind,],
#'                 "x2"=x2[-ind,])
#' pred.gam1 <- predict(res.gam1,newldat)
#' pred.gam2 <- predict(res.gam2,newldat)
#' pred.gam3 <- predict(res.gam3,newldat)
#' pred.gam4 <- predict(res.gam4,newldat)
#' pred.gam5 <- predict(res.gam5,newldat)
#' pred.gam6 <- predict(res.gam6,newldat)
#' pred.gam7 <- predict(res.gam7,newldat)
#' plot(dat[-ind,"Fat"],pred.gam1)
#' points(dat[-ind,"Fat"],pred.gam2,col=2)
#' points(dat[-ind,"Fat"],pred.gam3,col=3)
#' points(dat[-ind,"Fat"],pred.gam4,col=4)
#' points(dat[-ind,"Fat"],pred.gam5,col=5)
#' points(dat[-ind,"Fat"],pred.gam6,col=6)
#' points(dat[-ind,"Fat"],pred.gam7,col=7)

#' pred2meas(newldat$df$Fat,pred.gam1)
#' pred2meas(newldat$df$Fat,pred.gam2)
#' pred2meas(newldat$df$Fat,pred.gam3)
#' pred2meas(newldat$df$Fat,pred.gam4)
#' pred2meas(newldat$df$Fat,pred.gam5)
#' pred2meas(newldat$df$Fat,pred.gam6)
#' pred2meas(newldat$df$Fat,pred.gam7)
#' }


#' @export
fregre.gsam.vs  <- function(data = list(), y, 
                            include = "all", exclude = "none"
                            ,family = gaussian(), weights = NULL 
                            , basis.x = NULL 
                            , numbasis.opt = FALSE
                            , kbs , dcor.min = 0.1, alpha = 0.05
                            , par.model, xydist, trace = FALSE
){
#0-   print(0)  
  verbose = trace
  #criterio = "sp"
  if (missing(y)) 
    stop("The name of the response must be specified in the 'y' argument")  
  # if (missing(alpha)) alpha <- .05
  if (missing(par.model)) 
    par.model <- list() #se puede cambiar la familia
  n.edf.correction <- FALSE
  namdata <- names(data)
  idf <- which("df"==namdata )
  ydat <- data$df[,y]
  namfunc <- names(data[-idf])
  namnfunc <- setdiff(names(data$df),y)
  namnfunc <- setdiff(namnfunc,exclude)
  namfunc <- setdiff(namfunc,exclude)
  if (include[1] == "all"){
    x <- c(namnfunc,namfunc)
    ifunc <- namfunc
    infunc <- namnfunc
  }  else {
    x = include
    ifunc <- intersect(x,namfunc)
    infunc <- intersect(c(y,x),namnfunc)
    #    xdatos <- as.list(data$df[,infunc,drop=F])
    #    xdatos <- c(xdatos,data[ifunc])
  } 
  
  # var.name <- names(x)
  # a <- sapply(x, class, USE.NAMES = T)
  # x <- x[, a != "character", drop = F]
  # var.name <- names(x)
  # if (include[1] != "all") {
  #   var.name <- intersect(include, var.name)
  # }
  # if (exclude[1] != "none") {
  #   var.name <- setdiff(var.name, exclude)
  # }
  xdatos <- as.list(data$df[,y,drop=F])
  xdatos <- c(xdatos,as.list(data$df[,infunc,drop=F]),data[ifunc])
  ldata0 <- xdatos
  resp <- y
  xentran <- NULL
  tipoentran <- NULL
  
  #1- caclulo= distancias de cada objeto consigo mismo
  #  print("1 calculando distancias")
  #  ldist0 <- dist.list(ldata0)
  xynam <- names(ldata0)
  nvar <- length(x)
  if (missing(xydist))    {
    xydist <- ldist0 <- dist.list(ldata0)
  }    else {
    ldist0 <- xydist[xynam]
  }
  # print("2 calculando distancias")
  parar <- FALSE
  it <- 1
  basisx <- NULL
  npredictors <- length(ldata0)
  ipredictors <- numeric(npredictors-1)
  names(ipredictors) <- setdiff(names(ldata0),resp)
  
  nvar <- npredictors - 1
  if (missing(kbs)) 
    kbs <- - 1 #rep(-1,nvar)
  #names(kbs) <- names(ipredictors)
  basis.list <- FALSE
  if (is.null(basis.x)) {
    # print("base nula")
    #ncomp <- rep(4,length(ifunc))
    #names(ncomp) <- ifunc
    ncomp <- 4
    type.basis = "pc"
  }  else {
     # print("base usuario")
    if (is.list(basis.x)){ # base proporcionada por el usuario
      basis.list = TRUE
      basisx <- basis.x
      type.basis = basis.x[[1]]$type
    }else{
      # print("base vector")
      if (length(basis.x)!=2) stop("basis.x must be a vector with length two")
      type.basis = basis.x[1]
      ncomp <- as.numeric(basis.x[2])
    }
  }  
  tbasis <- type.basis
  if (type.basis %in% c("fourier","bspline"))     tbasis="basis"
  
  dcor=matrix(0,nrow= nvar,ncol= nvar)
  colnames(dcor)=c(names(ipredictors))
  rownames(dcor)=1:nvar
  
  #2- caclulo correlacion  de cada la distancia de la respuesta vs distancia del resto de objetos
  # print("2 calculando correlaciones")
  dist_resp <- dcor.y(ldist0,resp)
  dcor[it,names(dist_resp)] <- dist_resp*(dist_resp > dcor.min)
  n.edf <- length(ydat)
  fpredictors.nl <- ""
  form.nl <- paste(resp,"~",sep="")
  basis2 <- list()  
  ycen <- data$df[,y]-mean(data$df[,y])
  gof <- NULL
  anyfdata <- FALSE
  res.prev <- gam(as.formula(paste0(form.nl,1)),data=data$df,family=family)
  while (!parar){
      # print("3 Seleccion variable-Regresion")
      # print("bucle")    
    esfactor <- FALSE #SE UTILIZA PARA NO PONER s(factor)
    #3- seleccion dcor mas elevada
    nam <-  names(dist_resp)
    ind.xentra <- which.max(dist_resp)
    xentra <- nam[ind.xentra]
    #dd <- dcor.ttest(ldist0[[resp]],ldist0[[xentra]],distance=TRUE)
    dd <- dcor.test(ldist0[[resp]],ldist0[[xentra]],n=n.edf)
    #if (trace){      print(dd);      print(xentra)    }   
    if (verbose) {
      print(it);      print(n.edf);      print(nam)
      cat("Enter the variable ",xentra);      print(dd);      print(names(dd))
    }
    #if (is.null(basisx)) {     basisx <- list()    }
    if (dd$p.value > alpha ) {
      parar=TRUE
      if (trace) print("The algorithm ends because no variable is significant")
    }  
    else{
      if (dd$estimate < dcor.min ) {
        parar=TRUE
        if (trace) print("The algorithm ends because dcor < dcor.min")
      }
      else {
        rownames(dcor)[it] <- xentra
        if (trace)      cat("Covariate: ")
        if (trace)       print(xentra)
        #if (is.fdata(ldata0[[xentra]]) & !basis.list) {
        if (is.fdata(ldata0[[xentra]])) {
          # print("clase de base")
          par.basis.ipred <- list()
          anyfdata <- TRUE
          par.basis.ipred$fdataobj <- ldata0[[xentra]]
          
          
         #if (numbasis.opt | basis.list ){
     
          if ( basis.list ){  
            type.basis = basis.x[[xentra]]$type
            if (type.basis=="bspline" | type.basis=="fourier") ncomp <- basis.x[[xentra]]$nbasis
            if (type.basis=="pc" | type.basis=="pls")      ncomp <- length(basis.x[[xentra]]$basis)
            basisx[[xentra]] <- basis.x[[xentra]]
          }
          
          if (xentra %in% names(type.basis)){
            itype <- which(xentra== names(type.basis))
            type.basis.ipred <- type.basis[itype]
          }    else {
            type.basis.ipred <- type.basis[1]
          }
          tbasis <- type.basis.ipred
          if (type.basis.ipred=="fourier") tbasis <- "basis"
          if (type.basis.ipred=="bspline") tbasis <- "basis"
          
          
    
          
          nam <- "fregre.gsam.cv"
          par.basis.ipred$y <- resp#entra la etiqueta y no toda la variable como en el basis.cv o pc.cv
          par.basis.ipred$x <- xentra
          par.basis.ipred$data <- data
          if (numbasis.opt)      {
            res <-  fregre.gsam.cv(data,resp,xentra, alpha = alpha
                                   ,family=family
                                   #,type.basis=tbasis, kbs = kbs
                                   ,type.basis=type.basis, kbs = kbs
                                   ,numbasis = ncomp, numbasis.opt=numbasis.opt)  
            if (tbasis!="basis"){
              res$basis.x[[xentra]]$basis <- res$basis.x[[xentra]]$basis[res$numbasis.opt,]
            }
            res$basis.x[[xentra]]$l <- res$numbasis.opt
            basisx[[xentra]] <- res$basis.x[[xentra]]
            #basisb[[xentra]] <- res$basis.x[[xentra]]
            if (trace)   print("results of internal function fregre.gsam.cv")
            if (trace)   print(summary(res))
            # parar=TRUE
          }        else{
            # print("no numbasis.opt")
            if (!basis.list){
             # print("NO entra gsam.CV")
            switch(tbasis,
                   "pc"={
                     best.pc <- 1:ncomp#[xentra]
                     basis1 <- create.pc.basis(ldata0[[xentra]],best.pc) 
                   },
                   "pls"={
                     best.pc <- 1:ncomp#[xentra]
                     basis1 <- create.pls.basis(ldata0[[xentra]],ldata0[[resp]],best.pc)
                   },"basis"={
                     #            best.pc <- 1:kmax
                     #basis1 <- create.bspline.basis(ldata0[[xentra]]$rangeval,nbasis=ncomp)
                     basis1 <- create.fdata.basis(ldata0[[xentra]],l=1:ncomp,type.basis=type.basis,
                                                  rangeval=ldata0[[xentra]]$rangeval)
                     args(create.fdata.basis)
                  #   basis2 <- basis1
                   })
            basisx[[xentra]] <- basis1
          } else  basisx[[xentra]] <- basis.x[[xentra]]}
        }       
        
        if (!parar) {
          # print("no parar");          print(xentra)
          xentran <- c(xentran,xentra)
          ind.xentra2 <- which(xentra==names(ldist0))
          ldist0 <- ldist0[-ind.xentra2]
          #ipredictors[xentra] <- ipredictors[xentra]+1
          #4- contruccion del modelo para esta variable #consido un catalogo de 4 posibilidades (lineal/nolineal, funcional/scalar)
          if (is.factor(ldata0[[xentra]])) esfactor=TRUE                 
          fx=FALSE
          if (esfactor)      {
            fpredictors.nl <- xentra  
          }
          else         {   
            fpredictors.nl <- paste("s(",xentra,",k=",kbs,",","fx=",fx,")",
                                    sep="",collapse="+")
          }
          nam.model <- "fregre.gsam"          
          form.nl <- (paste(form.nl,"+",fpredictors.nl,sep=""))  
          par.model$formula <- as.formula(form.nl)
          par.model$data <- data
          par.model[["basis.x"]] <- basisx
          if (tbasis == "basis"){
              par.model[["basis.b"]] <- basisx
          }
          par.model[["family"]] <- family
          res.nl  <- do.call(nam.model,par.model) 
          mejora <- res.prev$aic > res.nl$aic
        #  print(res.nl);          print(it);          print(res.prev);          print(mejora);          print(res.prev$gcv.ubre);          print(res.nl$gcv.ubre)
          if (it>1 & mejora){
            aov <- anova(res.nl,res.prev,test="F")
            mejora <- aov[["Pr(>F)"]][2] < alpha
            if (length(mejora)==0)       mejora <- TRUE
            if (is.na(mejora)) mejora <- TRUE
          } 
          #   cat("mejora",res.prev$gcv.ubre, res.nl$gcv.ubre,res.prev$gcv.ubre > res.nl$gcv.ubre,"\n")      
          if (mejora){
            suma <- summary(res.nl)          
            dd <- res.nl$dims
            df <- dd[["p"]]
            edf <- dd[["N"]] - dd[["p"]]    
            #         sr2 <- sum(res.nl$residuals^2)/edf
            r2 <- 1 - sum(res.nl$residuals^2)/sum(ycen^2)
            if (n.edf.correction) n.edf <-  edf
            ldata0[[resp]] <- res.nl$residuals
            ldist0[[resp]] <- as.matrix(dist(ldata0[[resp]]), diag =TRUE, upper = TRUE,     p = 2)
            #2- caclulo correlacion  de cada la distancia de la respuesta vs distancia del resto de objetos
            if (it==(npredictors-1)) {
              parar=TRUE
              #gof <- rbindgof,c(suma$logLik,suma$BIC,suma$AIC,edf,r2))
              gof <- rbind(gof,c(AIC(res.nl),deviance(res.nl),res.nl$df.residual,suma$r.sq,suma$dev.expl,res.nl$gcv.ubre))
            }        else{
              it <- it+1
              # print("3 calculando correlaciones")
              dist_resp <- dcor.y(ldist0,resp)
              # LO PONE A 0
              dcor[it,names(dist_resp)]=dist_resp*(dist_resp > dcor.min)
              #  print(dcor[it,names(dist_resp)])
            }
            #     print(summary(res.nl))
            if (!parar){ 
              #            gof <- rbind(gof,drop(c(suma$logLik,suma$BIC,suma$AIC,edf,r2)))
              gof <- rbind(gof,c(AIC(res.nl),deviance(res.nl),
                                 res.nl$df.residual,suma$r.sq,
                                 suma$dev.expl,res.nl$gcv.ubre))
            }
            else { 
              cat("The variable ",xentra,"does not improve the fit of the model\n")
            }
            res.prev <- res.nl
          }   else{
            #print("NOOOOOOO MEJORAAAAAAAA")
            if (it==nvar) parar=TRUE
            res.nl <- res.prev
            xentran <- xentran[-length(xentran)]
            dist_resp[xentra]<-0
            if (all(dist_resp==0)) parar=TRUE
          }
        }
      }  } }
  gof <- data.frame(xentran,(gof))
  if (is.null(xentran)) {
    warning("No variable selected, a null model is estimated")    
    res.nl <- fregre.gsam(as.formula(paste(resp,"~1",sep="")),data=data,family=family)
    suma <- summary(res.nl)
    gof <- data.frame(rbind(c(1,AIC(res.nl),deviance(res.nl),
                              res.nl$df.residual,suma$r.sq,
                              suma$dev.expl,res.nl$gcv.ubre)))      
  }  
  if (length(gof)==1) gof <- numeric(7)
  names(gof) <- c("xentra","AIC","deviance","df.residual","r.sq","dev.expl","GCV.ubre")
  res.nl$gof <- gof
  res.nl$i.predictor = ipredictors
  res.nl$i.predictor[xentran] <- 1
  res.nl$ipredictors = xentran
  res.nl$xydist = xydist
  res.nl$dcor = dcor[1:(it-1),]
  return(res.nl)
}