R/gamlssVGD_23_12_21.R

Defines functions CV gamlssCV TGD VGD gamlssVGD

Documented in CV gamlssCV gamlssVGD TGD VGD

# ------------------------------------------------------------------------------
# this work is a rethinking on Training, Validation and Test data sets analysis
# gamlssVGD() is a combination of the original TGD()  TGD1() TGD2() functions
# it fits a gamlss model at the traning  data (rand==1) and calculates the global
# deviance for the validation set (rand==2)
# TO DO: a) It should also save the residuals for the validation test 
#         Now done the validated residuals are saved as "residVal"
#        b) gamlssCV() needs parallelization
#        c) need a function for comparing CV models    
#------------------------------------------------------------------------------
# functions
#-------------------------------------------------------------------------------
#   i) gamlssVGD() for fiting a model and then calculate the deviance for the 
#      extra data
#  ii) VGD() for comparing fitted gamlssVHD models
# iii) getTGD()  after fitting to training data  get the global deviance for 
#      the test data
#  iv) TDG() comparing fitted TGD objects
#   v) gamlssCV() for fitting k-folds cross validation
#  vi) CV() comparing fitted CV objects
#-------------------------------------------------------------------------------
#rand <- sample(2, 610, replace=T, prob=c(0.6,0.4)
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# VALIDATION: this is fitting a gamlss model on a sample from the original data 
#  and calculated the Validated Global Deviance from the new Validated data
gamlssVGD <-function(formula = NULL, 
               sigma.formula = ~1, 
                  nu.formula = ~1, 
                 tau.formula = ~1, 
                        data = NULL, # original data 
                      family = NO,  
                     control = gamlss.control(trace=FALSE),
                        rand = NULL, # 1 for training 2 for validation
                     newdata = NULL, 
                          ...)
 {
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# this is to replicate rqres within gamlssVGD enviroment
# it is used as in gamlss()
rqres <- function (pfun = "pNO", 
                   type = c("Continuous", "Discrete", "Mixed"),
               censored = NULL,  
                   ymin = NULL, 
                 mass.p = NULL, 
                prob.mp = NULL,
                      y = y,
                      ... )
  { }
  body(rqres) <-  eval(quote(body(rqres)), envir = getNamespace("gamlss"))
##------------------------------------------------------------------------------
##------------------------------------------------------------------------------  
##------------------------------------------------------------------------------
## main function starts here
##------------------------------------------------------------------------------
   sys_call <- sys.call() 
if (is.null(data))   stop("data should be set here")
if (is.null(rand)&&is.null(newdata)) stop("rand or newdata should be set")
if (!is.null(rand))
  {
  if ( any(!rand%in%c(1,2))) stop("rand values should be 1 or 2")
  dataTraining <- subset(data, rand==1)
     dataValid <- subset(data, rand==2) 
}  
       fname <- as.gamlss.family(family)
        dfun <- paste("d", fname$family[[1]],sep="")
        pfun <- paste("p", fname$family[[1]],sep="")
        lpar <- length(fname$parameters)
       dtype <- fname$type
     #  fname$rqres[[1]][["ymin"]]
        if (is.null(formula)) stop("no formula is set in gamlssVGD")        
        if (is.null(data)) stop("the data argument is needed in gamlssVGD")
# FIT MODEL + predict ----------------------------------------------------------
if (!is.null(rand))#  if `rand' is set do this ---------------------------------
 {
          m1 <- gamlss(formula=formula, sigma.formula = sigma.formula, 
                       nu.formula = nu.formula, tau.formula = tau.formula, 
                       data= dataTraining, family=family, control=control, ...) 
     nfitted <- predictAll(m1, newdata=dataValid, data=dataTraining)
dim1newdata  <- dim(dataValid)[1]
 } else
 {                   #if `newdata'  is set do this ----------------------------------
          m1 <- gamlss(formula = formula, sigma.formula = sigma.formula, 
                      nu.formula = nu.formula, tau.formula = tau.formula, 
                      data = data, family = family, control = control, ...)
     nfitted <- predictAll(m1, newdata=newdata, data=data)
dim1newdata  <- dim(newdata)[1]
 }
#------------------------------------------------------------------------------
# get  y for new data  if binomial       
if (fname$family[1] %in% .gamlss.bi.list)# if binomial
   {
     if (NCOL(nfitted$y) == 1) 
     {
       y1 <- nfitted$y 
       bd <- nfitted$bd
     }
     else                 
     {
       bd <- nfitted$y[,1] + nfitted$y[,2]
       y1 <- nfitted$y[,1]
     }
   } else
   {
     y1 <- nfitted$y 
   }
#------------------------------------------------------------------------------
# jump depending on the number of parameters       
if(lpar==1) 
   {
     if (fname$family[[1]] %in% .gamlss.bi.list)
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu, bd=bd, log=TRUE) 
       ures <-  call("rqres", pfun=pfun, type=dtype,
                     ymin=fname$rqres[[1]][["ymin"]], 
                     y=y1, bd=bd, mu= nfitted$mu) 
     } else
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu,  log=TRUE) 
       ures <-  call("rqres", pfun=pfun, type=dtype, 
                     ymin=fname$rqres[[1]][["ymin"]], 
                     y=y1, mu= nfitted$mu)
     } 
   }
else if(lpar==2)
   { 
     if (fname$family[[1]] %in% .gamlss.bi.list)
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma= nfitted$sigma,  
                     bd=bd, log=TRUE) 
       ures <-  call("rqres", pfun=pfun, type=dtype, 
                     ymin=fname$rqres[[1]][["ymin"]], 
                     y=y1, mu= nfitted$mu, sigma= nfitted$sigma, bd=bd)
     } else
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, log=TRUE) 
       ures <- call("rqres", pfun=pfun,  type=dtype, 
                     ymin=fname$rqres[[1]][["ymin"]], 
                     y=y1, mu= nfitted$mu, sigma= nfitted$sigma )
     } 
   }
else if(lpar==3)
   {
     if (fname$family[[1]] %in% .gamlss.bi.list)
     {
       devi <- call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, 
                     nu =  nfitted$nu, bd=bd, log=TRUE)
       ures <- call("rqres", pfun=pfun,  type=dtype, 
                    ymin=fname$rqres[[1]][["ymin"]], 
                    y=y1, mu= nfitted$mu, sigma= nfitted$sigma,
                    nu =  nfitted$nu, bd=bd)
     } else
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, 
                     nu =  nfitted$nu, log=TRUE)
       ures <- call("rqres", pfun=pfun,  type=dtype, 
                    ymin=fname$rqres[[1]][["ymin"]], 
                    y=y1, mu= nfitted$mu, sigma= nfitted$sigma,
                    nu =  nfitted$nu)
     } 
   }
else 
   {
     if (fname$family[[1]] %in% .gamlss.bi.list)
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, 
                     nu =  nfitted$nu, tau= nfitted$tau, bd=bd, log=TRUE)
       ures <- call("rqres", pfun=pfun,  type=dtype, 
                    ymin=fname$rqres[[1]][["ymin"]], 
                    y=y1, mu= nfitted$mu, sigma= nfitted$sigma,
                    nu =  nfitted$nu, tau = nfitted$tau, bd=bd)
     } else
     {
       devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma,
                     nu =  nfitted$nu, tau =  nfitted$tau, log=TRUE)
       ures <- call("rqres", pfun=pfun,  type=dtype, 
                    ymin=fname$rqres[[1]][["ymin"]], 
                    y=y1, mu= nfitted$mu, sigma= nfitted$sigma,
                    nu =  nfitted$nu, tau= nfitted$tau)
      # ures <-  call(pfun, q=y1, mu =  nfitted$mu, sigma =  nfitted$sigma,nu =  nfitted$nu,tau =  nfitted$tau)
     } 
   }
         Vresid <- eval(ures)
       dev.incr <- -2 * eval(devi)
            dev <- sum(dev.incr)
         m1$VGD <- dev
     m1$IncrVGD <- dev.incr
m1$predictError <- dev/dim1newdata
    m1$residVal <- Vresid 
 m1$call$family <- sys_call$family
#   }#----------END of newdata -------------------------------------------------
 class(m1) <- c("gamlssVGD", "gamlss", "gam",    "glm",    "lm"  )
     m1
 } # end of function
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
VGD <- function(object,...) #UseMethod("AIC")
{
  # local function
  is.gamlssVGD <-  function (x)   inherits(x, "gamlssVGD")
  if (length(list(...))) 
  {
      object <- list(object, ...)
    isgamlss <- unlist(lapply(object, is.gamlssVGD))
    if (!any(isgamlss)) stop("some of the objects are not gamlssVGD")
         VGD <- as.numeric(lapply(object, function(x) x$VGD)) 
         val <- cbind(VGD)
        Call <- match.call()
   row.names <- as.character(Call[-1])
       o.val <- order(val)
         val <-  as.data.frame(val[o.val])
       o.r.n <-row.names[o.val]
rownames(val) <- o.r.n 
colnames(val) <- "Pred.GD"
  val
  }
  else 
  { val <- if (is.gamlssVGD(object)) object$VGD 
    else stop(paste("this is not a gamlssVGD object"))
    val 
  }
}
#-------------------------------------------------------------------------------    
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# this requires a fitted model and the new data only
getTGD<- function (object,   newdata = NULL, ...) 
{
  if (!is.gamlss(object)) stop("not a gamlss object")
  fname <- as.gamlss.family(object$family[[1]])
  dfun <- paste("d", fname$family[[1]], sep = "")
  pfun <- paste("p", fname$family[[1]],sep="")
  lpar <- length(fname$parameters)
  #lpar <- sum(unlist(fname$parameter))
  if (is.null(newdata)) 
    stop("no newdata is set in VGD")
   nfitted <- predictAll(object, newdata=newdata, ...)
  if (is.null(nfitted$y)) stop("the response variables is missing in the newdata")
  
  if (fname$family[1] %in% .gamlss.bi.list)# if binomial
  {
    if (NCOL(nfitted$y) == 1) 
    {
      y1 <- nfitted$y 
      bd <- nfitted$bd
    }
    else                 
    {
      bd <- nfitted$y[,1] + nfitted$y[,2]
      y1 <- nfitted$y[,1]
    }
  } else
  {
    y1 <- nfitted$y 
  }
  
  if(lpar==1) 
  {
    if (fname$family[[1]] %in% .gamlss.bi.list)
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, bd=bd, log=TRUE) 
      ures <-  call(pfun, q=y1, mu =  nfitted$mu, bd=bd)  
    } else
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu,  log=TRUE) 
      ures <-  call(pfun, q=y1, mu =  nfitted$mu)   
    } 
  }
  else if(lpar==2)
  {
    if (fname$family[[1]] %in% .gamlss.bi.list)
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma= nfitted$sigma,  bd=bd, log=TRUE) 
      ures <-  call(pfun, q=y1, mu =  nfitted$nmu, sigma= nfitted$sigma, bd=bd)  
    } else
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, log=TRUE) 
      ures <-  call(pfun, q=y1, mu =  nfitted$mu, sigma =  nfitted$sigma) 
    } 
  }
  else if(lpar==3)
  {
    if (fname$family[[1]] %in% .gamlss.bi.list)
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, nu =  nfitted$nu, bd=bd, log=TRUE)
      ures <-  call(pfun, q=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, nu =  nfitted$nu, bd=bd) 
    } else
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, nu =  nfitted$nu, log=TRUE)
      ures <-  call(pfun, q=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, nu =  nfitted$nu)
    } 
  }
  else 
  {
    if (fname$family[[1]] %in% .gamlss.bi.list)
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, nu =  nfitted$nu, tau= nfitted$tau, bd=bd, log=TRUE)
      ures <-  call(pfun, q=y1, mu =  nfitted$mu, sigma =  nfitted$sigma, nu =  nfitted$nu, tau= nfitted$tau, bd=bd)
    } else
    {
      devi <-  call(dfun, x=y1, mu =  nfitted$mu, sigma =  nfitted$sigma,nu =  nfitted$nu,tau =  nfitted$tau, log=TRUE)
      ures <-  call(pfun, q=y1, mu =  nfitted$mu, sigma =  nfitted$sigma,nu =  nfitted$nu,tau =  nfitted$tau)
    } 
  }
  Vresid <- qNO(eval(ures))
  dev <- -2*sum(eval(devi))
  out <-list()
  out <- list(TGD=dev,  predictError=dev/dim(newdata)[1], resid=Vresid)
  class(out) <- "gamlssTGD"
  out
}
#----------------------------------------------------------------------------------------    
TGD <- function(object,...) #UseMethod("AIC")
{
  # local function
  is.TGD <-  function (x)   inherits(x, "gamlssTGD")
  if (length(list(...))) 
  {
    object <- list(object, ...)
    isTGD <- unlist(lapply(object, is.TGD))
    if (!any(isTGD)) stop("some of the objects are not TGD")
    TGD <- as.numeric(lapply(object, function(x) x$TGD)) 
    val <- cbind(TGD)
    Call <- match.call()
    row.names <- as.character(Call[-1])
    o.val <- order(val)
    val  <-  as.data.frame(val[o.val])
    o.r.n <-row.names[o.val]
    rownames(val) <- o.r.n
    colnames(val) <- "Pred.GD"
    val
  }
  else 
  { val <- if (is.TGD(object)) object$TGD 
    else stop(paste("this is not a TGD object"))
    val 
  }
}

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# gamlssCV0 <- function(formula = NULL, 
#                      sigma.formula =~1, 
#                         nu.formula =~1, 
#                        tau.formula =~1, 
#                               data = NULL, # original data 
#                             family = NO,  
#                            control = gamlss.control(trace=FALSE),
#                             K.fold = 10,
#                           set.seed = 123,
#                               rand = NULL) 
# {
#   if (is.null(data))   stop("data should be set here")
#         N <- dim(data)[1]
#   #RNAMES <- rownames(data)
#   residCV <- rep(0, N)
#  set.seed <- set.seed
#      rand <- if (is.null(rand)) sample (K.fold , N, replace=TRUE)
#            else rand
#   if (length(rand)!=N) stop("the length of the rand should be equal to data")
#        CV <- rep(0, K.fold)
#   for (i in sort(unique(rand)))
#   {
#     cat("fold ", i, "\n",sep="")
#     learn <- gamlssVGD(formula=formula, sigma.formula=sigma.formula, nu.formula=nu.formula, tau.formula=tau.formula,  family=family, control=control, data=data[rand!=i,], newdata=data[rand==i,])
#     residCV[rand==i] <-  learn$residVal
#                CV[i] <- learn$VGD
#   }
# out <- list(VGD=sum(CV), CVGD=CV, residCV=residCV) 
# class(out) <- "gamlssCV"
# out
# }
#------------------------------------------------------------------------------
gamlssCV <- function(formula = NULL, 
               sigma.formula = ~1, 
                  nu.formula = ~1, 
                 tau.formula = ~1, 
                        data = NULL, # original data 
                      family = NO,  
                     control = gamlss.control(trace=FALSE),
                      K.fold = 10,
                    set.seed = 123,
                        rand = NULL,
                    parallel = c("no", "multicore", "snow"), 
                       ncpus = 1L, 
                          cl = NULL, ...) 
{
#--------------- PARALLEL-------------------------------------------------------
#----------------SET UP PART----------------------------------------------------
if (missing(parallel)) 
    parallel <- "no"
    parallel <- match.arg(parallel)
     have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) 
{
  if (parallel == "multicore") 
      have_mc <- .Platform$OS.type != "windows"
  else if (parallel == "snow") 
    have_snow <- TRUE
  if (!have_mc && !have_snow) 
        ncpus <- 1L
  loadNamespace("parallel")
}
# -------------- finish parallel------------------------------------------------
#-------------------------------------------------------------------------------
sys_call <- sys.call() 
if (is.null(data))   stop("data should be set here")
       N <- dim(data)[1]
# RNAMES <- rownames(data)
set.seed <- set.seed
    rand <- if (is.null(rand)) sample(K.fold , N, replace=TRUE)
            else rand
if (length(rand)!=N) stop("the length of the rand should be equal to data")
  K.fold <- length(unique(rand))
      CV <- rep(0, K.fold)
 residCV <- rep(0, N)
       i <- sort(unique(rand))
#---------------------------------------
fn <- function(i,...)
 {
    cat("fold ", i, "\n",sep="")
    learn <- gamlssVGD(formula=formula, sigma.formula=sigma.formula, nu.formula=nu.formula, tau.formula=tau.formula,  family=family, control=control, data=data[rand!=i,], newdata=data[rand==i,],...)
    # residCV[rand==i] <<-  learn$residVal
    list(CV=learn$VGD, resid=learn$residVal) 
 }
#pp<-vapply(i, fn, list("gc", "res"))
# ll<-lapply(i, fn)
# ss<-sapply(i, fn)
#========================================
# --------  parallel -----------------------------------------------------------
CV <- if (ncpus > 1L && (have_mc || have_snow)) 
{ 
  if (have_mc) 
  {# sapply(scope, fn)
      parallel::mclapply(i, fn, mc.cores = ncpus)
  }
  else if (have_snow) 
  {
      list(...)
   if (is.null(cl)) 
    {
# make the cluster
# cl <- parallel::makePSOCKcluster(rep("localhost", ncpus))
  cl <- parallel::makeForkCluster(ncpus)
    if (RNGkind()[1L] == "L'Ecuyer-CMRG") 
          parallel::clusterSetRNGStream(cl)
 res <- parallel::parLapply(cl, i, fn)
    parallel::stopCluster(cl)
  res
     }
    else t(parallel::parLapply(cl, i, fn))
    }
} # end parallel ---------------------------------------------------------------
else lapply(i, fn)
  cv <- rep(0, K.fold)
for (i in 1:K.fold)
  {
    residCV[rand==i] <-  CV[[i]]$resid
    cv[i] <- CV[[i]]$CV
  }
 # CV <-  sapply(i, fn) 
  out <- list(CV=sum(cv), allCV=cv, residCV=residCV) 
  class(out) <- "gamlssCV"
  out
}
#-------------------------------------------------------------------------------
CV <- function(object,...) #UseMethod("AIC")
{
  # local function
  is.CV <-  function (x)   inherits(x, "gamlssCV")
  if (length(list(...))) 
  {
    object <- list(object, ...)
    isCV <- unlist(lapply(object, is.CV))
    if (!any(isCV)) stop("some of the objects are not gamlssCV")
    CV <- as.numeric(lapply(object, function(x) x$CV)) 
    val <- cbind(CV)
    Call <- match.call()
    row.names <- as.character(Call[-1])
    o.val <- order(val)
    val  <-  as.data.frame(val[o.val])
    o.r.n <- row.names[o.val]
    rownames(val) <- o.r.n 
    val
  }
  else 
  { val <- if (is.CV(object)) object$CV 
    else stop(paste("this is not a gamlssCV object"))
    val 
  }
}

Try the gamlss package in your browser

Any scripts or data that you put into this service are public.

gamlss documentation built on May 29, 2024, 6:08 a.m.