R/blockedCV.R

Defines functions print.blockedCVpred print.blockedCV blockedCV

Documented in blockedCV print.blockedCV print.blockedCVpred

#' @title \emph{blockedCV}: run spatial blocked cross-validation on the integrated model.
#' 
#' @description This function is used to perform spatial blocked cross-validation with regards to model selection for the integrated model. It does so by leaving out a block of data in the full model, running a model with the remaining data, and then calculating the deviance information criteria (DIC) as a score of model fit.
#' @param data An object produced by either \code{\link{startISDM}} of \code{\link{startSpecies}}. Requires the slot function, \code{.$spatialBlock} to be run first in order to specify how the data in the model is blocked.
#' @param options A list of \pkg{INLA} or \pkg{inlabru} options to be used in the model. Defaults to \code{list()}.
#' @param method Which cross-validation method to perform. Must be one of \code{'DIC'} or \code{'Predict'}. If \code{'DIC'} then the DIC values for each block are obtained. If \code{'Predict'} then predictions are made on a dataset in the left out block. For this to work, please specify the argument \code{methodOptions}.
#' @param predictName Name of the dataset to predict onto if \code{method = 'Predict'}.
#' @param datasetCombs A list of vectors containing dataset combinations to include in each model run if \code{method = 'Prediction'}. If \code{NULL} then all combinations of the dataset will be estimated.
#' @import inlabru
#' @import stats
#' @importFrom utils combn
#' 
#' @examples 
#' 
#'\dontrun{
#'  if(requireNamespace('INLA')) {
#'    
#'  #Get Data
#'  data("SolitaryTinamou")
#'  proj <- "+proj=longlat +ellps=WGS84"
#'  data <- SolitaryTinamou$datasets
#'  mesh <- SolitaryTinamou$mesh
#'  mesh$crs <- proj
#'  
#'  #Set model up
#'  organizedData <- startISDM(data, Mesh = mesh,
#'                             responsePA = 'Present',
#'                             Projection = proj)
#'  
#'  #Set up spatial block
#'  organizedData$spatialBlock(k = 2, rows = 2, cols = 1)
#'  
#'  #Run spatial block cross-validation
#'  blocked <- blockedCV(organizedData)
#'  
#'  #Print summary
#'  blocked
#'    
#'  }
#'}
#' 
#' @return An object of class \code{blockedCV}, which is essentially a list of DIC values obtained from each iteration of the model.
#' 
#' @export
#' 
blockedCV <- function(data, options = list(),
                      method = 'DIC', predictName = NULL,
                      datasetCombs = NULL) {
  
  #How do we do this?
   #Should we make data a list of data files;
   # or should we make another argument for thinned formulas to test based on the full model?
  
  if (!inherits(data, 'dataSDM') && !inherits(data, 'specifySpecies') && !inherits(data, 'specifyISDM')) stop('data needs to be a dataSDM object.')
  
  if (is.null(data$.__enclos_env__$private$INLAmesh)) stop('An fm_mesh_2d object is required before any model is run.')
  
  if (!data$.__enclos_env__$private$blockedCV) stop('Please use ".$spatialBlock" before using this function.')
  
  data2ENV(data = data, env = environment())
  
  results <- list()
  
  if (!method %in% c('DIC', 'Predict')) stop('method needs to be one of "DIC" or "Predict".')
  
  if (method == 'Predict') {
    
    if (is.null(predictName)) stop('predictName cannot be NULL if method = "Predict".')
    
    if (!predictName %in% data$.__enclos_env__$private$dataSource) stop('predictData needs to be the name of a dataset added to the model.')
    ##CHANGE
     #Remove predict dataset here, and then add it as its own
    dataIn <- unique(data$.__enclos_env__$private$dataSource)
    
    if (!is.null(datasetCombs)) {
      
      if (!all(unique(unlist(datasetCombs)) %in% dataIn)) stop('Dataset added in datasetCombs not available.')
      
    } else datasetCombs <- do.call(c, lapply(seq_along(dataIn), combn, x = dataIn, simplify = FALSE))
    
  } else datasetCombs = list(data$.__enclos_env__$private$dataSource)
  
  block_index <- lapply(unlist(data$.__enclos_env__$private$modelData, recursive = FALSE), function(x) data.frame(x)[, '.__block_index__'])
  
  if (!is.null(data$.__enclos_env__$private$temporalName)) {
    
    numTime <- length(unique(unlist(data$.__enclos_env__$private$temporalVars)))
    
    newIPS <- rep(list(data$.__enclos_env__$private$IPS), numTime)
    
    newIPS <- do.call(rbind, newIPS)
    
    newIPS[, data$.__enclos_env__$private$temporalName] <- rep(1:numTime, each = nrow(data$.__enclos_env__$private$IPS))
    
    newIPS <- st_transform(newIPS, data$.__enclos_env__$private$Projection)
    
    data$.__enclos_env__$private$IPS <- newIPS
    
  }
  
  for (dataSub in 1:length(datasetCombs)) {
  
    dataToUse <- unique(datasetCombs[[dataSub]])
    
  for (fold in unique(unlist(block_index))) {
    
    ##Maybe make block_index in dataSDM as a list such that we can see which datasets are not in block i to easily remove them.
     #Get formula terms only after likelihood construction, and then thin components from there.
     #And also for the whole, control.family thing
    
    #Subset here based on dataSub
    trainData <- lapply(data$.__enclos_env__$private$modelData[dataToUse], function(data) {
      
      lapply(data, function(x) {
        
        data <- x[x$.__block_index__ != fold,]
        if (nrow(data) > 0) data
        else NULL
        
      })
      
      
    })
    sourceIN <- which(unique(data$.__enclos_env__$private$dataSource) %in% names(trainData))
    sourcePred <- which(data$.__enclos_env__$private$dataSource %in% predictName)
    comp_terms <- gsub('\\(.*$', '', data$.__enclos_env__$private$Components)
    
    ##Check if all copy + bias Main in here
    
    #modelData[predictName]
    
    if (method == 'Predict') {
    
      testData <- lapply(data$.__enclos_env__$private$modelData[predictName], function(data) {
      
      lapply(data, function(x) {
        
        data <- x[x$.__block_index__ == fold,]
        if (nrow(data) > 0) data
        else NULL
        
      })
      
    })
      
      #if testData dosen't cover all blocks stop
      
      testLike <- do.call(inlabru::like_list,
                          makeLhoods(data = testData, #Is this an sf dataset?
                                     formula = data$.__enclos_env__$private$Formulas[predictName],
                                     family = data$.__enclos_env__$private$Family[predictName],
                                     mesh = data$.__enclos_env__$private$INLAmesh,
                                     ips = data$.__enclos_env__$private$IPS,
                                     samplers = data$.__enclos_env__$private$Samplers,
                                     paresp = data$.__enclos_env__$private$responsePA,
                                     ntrialsvar = data$.__enclos_env__$private$trialsPA,
                                     markstrialsvar = data$.__enclos_env__$private$trialsMarks,
                                     speciesname = data$.__enclos_env__$private$speciesName,
                                     speciesindex = data$.__enclos_env__$private$speciesIndex))
      
      #Collapse into 1 sf
       #how? do.call(rbind, testData) or do.call(c, testData)
      
      #For comps: take environmental covariates: paste
       #if shared then keep
      if (!is.null(data$.__enclos_env__$private$Spatial)) {
       if (data$.__enclos_env__$private$Spatial == 'copy') spatRM <- TRUE
       else spatRM <- FALSE #What about if species copy?
      } else spatRM <- FALSE
      
      oldIn <- lapply(testLike, function(x) x$used$effect)
      
      #predComp <- comp_terms %in% formIn
      
      #if (spatRM) {
        
      #  compsChange <- data$.__enclos_env__$private$Components
      #  whichRM <- grepl(paste0(predictName, '_spatial'), compsChange)
      #  compsChange[whichRM] <- paste0(predictName, '_spatial(main = geometry, model = predict_field)')
      #  predict_field <- data$spatialFields$datasetFields[[1]]
        
      #} else compsChange <- data$.__enclos_env__$private$Components
      
      #compPreds <- formula(paste('~ - 1 +', paste(c(compsChange[predComp], 'olikhoodvar(main = olikhoodvar, model = "offset")'), collapse = ' + ')))
      
      for (lik in 1:length(testLike)) {
        
      testLike[[lik]]$used$effect <- c(paste0('testIntercept', lik), 'olikhoodvar')#c(testLike[[1]]$used$effect, 'olikhoodvar')
      
      }

    }

    trainLiks <- do.call(inlabru::like_list,
                 makeLhoods(data = trainData,
                 formula = data$.__enclos_env__$private$Formulas[sourceIN],
                 family = data$.__enclos_env__$private$Family[sourceIN],
                 mesh = data$.__enclos_env__$private$INLAmesh,
                 ips = data$.__enclos_env__$private$IPS,
                 samplers = data$.__enclos_env__$private$Samplers,
                 paresp = data$.__enclos_env__$private$responsePA,
                 ntrialsvar = data$.__enclos_env__$private$trialsPA,
                 markstrialsvar = data$.__enclos_env__$private$trialsMarks,
                 speciesname = data$.__enclos_env__$private$speciesName,
                 speciesindex = data$.__enclos_env__$private$speciesIndex))
      
    formula_terms <- unique(unlist(lapply(trainLiks, function(x) {
      
      if (!identical(unlist(x$used), character(0))) unlist(x$used)
      else labels(terms(x$formula))
      
    })))
    
    if (method == 'Predict') {
      
      if (any(grepl('_biasField', formula_terms)) & length(dataToUse) == 1) formula_terms <- formula_terms[!grepl('_biasField', formula_terms)]
    
      fams <- unlist(lapply(trainLiks, function(x) x$family)) == 'cp'

    if (!any(fams)) {
      
      ips <- data$.__enclos_env__$private$IPS
      ips$respIPS <- 0
      
      if (inherits(data, 'specifySpecies')) {
        
        if (!is.null(data$.__enclos_env__$private$speciesSpatial)) {
          
        if (data$.__enclos_env__$private$speciesSpatial == 'replicate') ips <- fmesher::fm_cprod(ips, data.frame(speciesSpatialGroup = 1:max(data$.__enclos_env__$private$speciesTable$index)))
        
        }
        if (!is.null(data$.__enclos_env__$private$Intercepts)) {
          
          if (data$.__enclos_env__$private$speciesIntercepts) ips <- fmesher::fm_cprod(ips, data.frame(specIntTermRem = 1:max(data$.__enclos_env__$private$speciesTable$index)))
          names(ips)[names(ips) == 'specIntTermRem'] <- data$.__enclos_env__$private$speciesName
        }
      } 
      
      ipsLike <- inlabru::like(formula = respIPS ~ .,
                                include = formula_terms, E = ips$weight,
                    family = 'poisson', data = ips)
      
      # trainLiks[['ips']] <- ipsLike
      uFam <- FALSE
    } else uFam <- FALSE
      
    } else uFam <- FALSE
    
    
    comp_keep <- comp_terms %in% formula_terms
    
    if (!all(comp_keep)) {
      
      if (!is.null(data$.__enclos_env__$private$Spatial)) {
      
      if (data$.__enclos_env__$private$Spatial != 'shared' | data$.__enclos_env__$private$biasCopy) {
        
        Main <- grepl('_spatial', data$.__enclos_env__$private$Components) & grepl('_field', data$.__enclos_env__$private$Components)
        if (!any(Main)) Main <- 'NOTMAIN'
        else Main <- sub("\\(.*", "", comp_terms[Main])
        
        MainBias <- grepl('_biasField', data$.__enclos_env__$private$Components) & grepl('_bias_field', data$.__enclos_env__$private$Components)
        if (!any(MainBias)) MainBias <- 'NOTALLMAINBIAS'
        else MainBias <- sub("\\(.*", "", comp_terms[MainBias])
        
        whichMissing <- union(data$.__enclos_env__$private$dataSource[!data$.__enclos_env__$private$dataSource %in% dataToUse], 
                              names(trainData)[sapply(unlist(trainData, recursive = F), is.null)])
        
        if(paste0(whichMissing[1],'_biasField') == MainBias | paste0(whichMissing[1],'_spatial') == Main) {
        
          warning('Main dataset or more than 2 datasets missing from the block with either pointsSpatial = "copy" or copyModel = TRUE for the bias field.\n Will choose the first available dataset to copy on.')
          
          thinnedComponents <- reduceComps(componentsOld =  formula(paste('~ - 1 +', paste(data$.__enclos_env__$private$Components, collapse = ' + '))),
                                            pointsCopy = ifelse(data$.__enclos_env__$private$Spatial == 'copy', 
                                                                TRUE, FALSE),
                                            biasCopy = data$.__enclos_env__$private$biasCopy,
                                            datasetName = whichMissing[1],
                                            reducedTerms = comp_terms[comp_keep])
          
        } else  thinnedComponents <- formula(paste('~ - 1 +', paste(data$.__enclos_env__$private$Components[comp_keep], collapse = ' + ')))
        
        
      } else thinnedComponents <- formula(paste('~ - 1 +', paste(data$.__enclos_env__$private$Components[comp_keep], collapse = ' + ')))
      
      } else thinnedComponents <- formula(paste('~ - 1 +', paste(data$.__enclos_env__$private$Components[comp_keep], collapse = ' + ')))
      
    }
    else thinnedComponents <- formula(paste('~ - 1 +', paste(data$.__enclos_env__$private$Components[comp_keep], collapse = ' + ')))

    foldOptions <- data$.__enclos_env__$private$optionsINLA
    
    fold_ind <- unique(unlist(block_index))[unique(unlist(block_index)) != fold]
    
    foldOptions$control.family <- foldOptions$control.family[sapply(unlist(data$.__enclos_env__$private$modelData, recursive = FALSE), 
                                                                    function(x) any(fold_ind %in% data.frame(x)[, '.__block_index__']))]

    #Subset fold options for each family
    sourceIN <- which(data$.__enclos_env__$private$dataSource %in% names(trainData))
    foldOptions$control.family <- foldOptions$control.family[sourceIN]
    
    optionsTrain <- append(options, foldOptions)
    
    if (uFam) optionsTrain$control.family <- append(optionsTrain$control.family, list(list(link = 'log')))
    
    ##Calculate DIC for just this model?
    trainedModel <- try(inlabru::bru(components = thinnedComponents,
                                 trainLiks,
                                 options = optionsTrain))
    
    ## -log(intensity)
    ## add an offset argument...
    
    if (method == 'DIC') {
    
    if (inherits(trainedModel, 'try-error')) {
      
      warning('Model failed for a block. Please change your block layout to ensure all datasets are included in all blocks')
      results[[paste0('DIC_fold_', fold)]] <- NA
    }
    else results[[paste0('DIC_fold_', fold)]] <- trainedModel$dic
    
    }
    else {
      
      if (inherits(trainedModel, 'try-error')) {
        
        warning('Model failed for a block. Please change your block layout to ensure all datasets are included in all blocks')
        #paste_dataset here
        results[[paste(dataToUse, collapse = ' and ')]][[paste0('fold',fold)]] <- NA
      
      }
      else {
        
        if (!is.null(data$.__enclos_env__$private$biasFormula)) biasFormlabels <- c(labels(terms(data$.__enclos_env__$private$biasFormula)), 'Bias__Effects__Comps')
        else biasFormlabels <- NULL

        for(pd in 1:length(testData[[1]])) {
          
        #covInPres <- testLike[[pd]]$used$effect
        #covInPres <- covInPres[!covInPres %in% c('olikhoodvar', biasFormlabels, grep('_biasField', covInPres))] #bias
          
        #IF species only include the species in the likelihood
          #Remove bias
          covInPres <- formula_terms
          covInPres <- covInPres[!covInPres %in% biasFormlabels]
          covInPres <- covInPres[!grepl('_biasField', covInPres)]
          
          if (sum(grepl('_spatial', covInPres)) > 1) {
            
            spatIn <- covInPres[grepl('_spatial', covInPres)]
            if (paste0(predictName,'_spatial') %in% spatIn) covInPres <- covInPres[!covInPres %in% paste0(dataToUse[!dataToUse%in% predictName],'_spatial')]
            else {
              if (any(unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse] %in% c('poisson', 'binomial'))) {
                
                keepSpat <- unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse] %in% c('poisson', 'binomial')
                keepSpat <- names(unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse])[keepSpat][1]
                covInPres <- covInPres[!covInPres %in% paste0(dataToUse[!dataToUse%in% keepSpat],'_spatial')]
                
              } else {
                
                keepSpat <- names(unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse])[1]
                covInPres <- covInPres[!covInPres %in% paste0(dataToUse[!dataToUse%in% keepSpat],'_spatial')]
                
              }
              
            }
            
          }
          
          if (sum(grepl('_intercept', covInPres)) > 1) {
            
            intInt <- covInPres[grepl('_intercept', covInPres)]
            if (paste0(predictName,'_intercept') %in% intInt)covInPres <- covInPres[!covInPres %in% paste0(dataToUse[!dataToUse%in% predictName],'_intercept')]
            else {
              if (any(unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse] %in% c('poisson', 'binomial'))) {
                
                keepInt <- unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse] %in% c('poisson', 'binomial')
                keepInt <- names(unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse])[keepInt][1]
                covInPres <- covInPres[!covInPres %in% paste0(dataToUse[!dataToUse%in% keepInt],'_intercept')]
                
              } else {
                
                keepInt <- names(unlist(data$.__enclos_env__$private$Family)[names(data$.__enclos_env__$private$Family) %in% dataToUse])[1]
                covInPres <- covInPres[!covInPres %in% paste0(dataToUse[!dataToUse%in% keepInt],'_intercept')]
                
              }
              
            }
            
          }
          ##grepl(_spatial, covInPres) <- if bigger than 1 <- then keep only predict and PA/Counts BUT KEEP shared_spatial
          ##grepl(_intercept, covInPres) <- then keep only predict and PA/Counts

          #if not shared spatial
          #Select predData intercept
          #Else if not in, select first non PO
          #Else select PO
          
          #Same for intercept
          
        if (!is.null(data$.__enclos_env__$private$speciesName)) {
          
          likeSpec <- unique(testLike[[pd]]$data[[paste0(data$.__enclos_env__$private$speciesName, 'INDEX_VAR')]])
          notSpec <- unique(unlist(data$.__enclos_env__$private$speciesIn))
          notSpec <- notSpec[notSpec != likeSpec]
          
          if (data$.__enclos_env__$private$speciesEnvironment) {
            
            specCov <- apply(expand.grid(paste0(notSpec,'_'), data$.__enclos_env__$private$spatcovsNames), MARGIN = 1, FUN = paste0,collapse = '')
            specCov <- c(specCov, paste0(notSpec, '_Fixed__Effects__Comps'))
            covInPres <- covInPres[!covInPres %in% specCov]
            
          }
          
          if (!is.null(data$.__enclos_env__$private$speciesIntercepts)) {
          
            if (!data$.__enclos_env__$private$speciesIntercepts)  covInPres <- covInPres[!covInPres %in% paste0(notSpec,'_intercept')]
          
          }
          
          if (!is.null(data$.__enclos_env__$private$speciesSpatial)) {
            
            if (data$.__enclos_env__$private$speciesSpatial == 'copy') {
              
              rmSpat <-  grepl(paste0(notSpec, collapse = '|'), covInPres) & grepl('_spatial', covInPres) 
              covInPres <- covInPres[!rmSpat]
              
            }
            
          }
          
        }
            
          #if bias
          
        #covInPres <- intersect(oldIn[[pd]], formula_terms)
        ##Get all old vars in test like and thin with formula terms
        predForm <- formula(paste0('~(', paste(covInPres, collapse = ' + '), ')'))
          
        #Change this to testLike[[pd]]$data
        testPredicts <- suppressWarnings(predict(trainedModel,testLike[[pd]]$data, formula = predForm)) # testData[[1]][[pd]]
        #IF cp add log(1) or NA to ipoints?
        #if (testLike[[pd]]$family == 'cp') {
          
        #  nIPS <- nrow(data$.__enclos_env__$private$IPS)
        #  testLike[[pd]]$data$olikhoodvar <- c(testPredicts$mean, rep(0, nIPS))
          
        #} else 
        testLike[[pd]]$data$olikhoodvar <- testPredicts$mean
        
        }
        
        #Need to look at formula, and get the correct comps here
        foldOptions <- data$.__enclos_env__$private$optionsINLA
        foldOptions$control.family <- foldOptions$control.family[sourcePred]
        
        optionsTest <- append(options, foldOptions)
        compsIntercepts <- paste0('testIntercept', 1:length(testData[[1]]),'(1)')
        compPreds <- formula(paste0('~ - 1 + olikhoodvar(main = olikhoodvar, model = "offset") + ', paste0(compsIntercepts, collapse = ' + ')))

        testModel <- try(inlabru::bru(components = compPreds,
                                   testLike, options = optionsTest))
        
        if (inherits(testModel, 'try-error')) results[[paste(dataToUse, collapse = ' and ')]][[paste0('fold',fold)]] <- NA
        else results[[paste(dataToUse, collapse = ' and ')]][[paste0('fold',fold)]] <- testModel$mlik[[1]] #mean(testModel$residuals$deviance.residuals)##trainedModel$mlik[[1]] - 
        
        
      }
    
    }
  }
    
  }
  
  comps <- formula(paste0(' ~ ', paste0(gsub('\\(.*$', '', data$.__enclos_env__$private$Components), collapse = ' + ')))
  
  if (method == 'DIC') {
  
  results <- append(results, list(Formula = comps))
  class(results) <- c('blockedCV', 'list')
  
  }
  else {
    
    class(results) <- c('blockedCVpred', 'list')
    
  }
  results

}


#' Export class blockedCV
#' 
#' @export

setClass('blockedCV')

#' Print for blockedCV
#' 
#' @export print.blockedCV

#' Export print.blockedCV
#' @title Print function for \code{blockedCV}.
#' @param x A blockedCV object.
#' @param ... Unused argument.
#' 
#' @exportS3Method 

print.blockedCV <- function(x, ...) {
  
  cat('Spatial block cross-validation score:')
  cat('\n\n')
  cat('Formula: ')

  cat(deparse1(x$Formula))
  cat('\n\n')
  x$Formula <- NULL
  mean.deviance <- sapply(x, function(y) y$mean.deviance)
  p.eff <- sapply(x, function(y) y$p.eff)
  dic <- sapply(x, function(y) y$dic)
  
  dataobj <- data.frame(mean.deviance = mean.deviance,
                        p.eff = p.eff,
                        dic = dic)
  
  row.names(dataobj) <- paste0('fold ', 1:nrow(dataobj))
  
  print.data.frame(dataobj)
  
  cat('\nmean DIC score: ')
  cat(mean(dataobj$dic, na.rm = TRUE))


}

#' Export class blockedCVpred
#' 
#' @export

setClass('blockedCVpred')

#' Print for blockedCVpred
#' 
#' @export print.blockedCVpred

#' Export print.blockedCVpred
#' @title Print function for \code{blockedCV}.
#' @param x A blockedCV object.
#' @param ... Unused argument.
#' 
#' @exportS3Method 

print.blockedCVpred <- function(x, ...) {
  
  cat('Spatial block cross-validation score:')
  cat('\n\n')
  cat('Log marginal likelihoods obtained by using the predictions from the model as an offset:\n\n')

  foldFrame <- do.call(rbind.data.frame, x)
  foldFrame <- data.frame(foldFrame, mean = rowMeans(foldFrame))
  colnames(foldFrame) <- c(paste0('fold ', 1:(ncol(foldFrame) - 1)), 'mean')
  
  #foldFrame <- data.frame(Data.included = row.names(foldFrame),
  #                        foldFrame)
  #names(foldFrame)[1] <- 'Data included'
  #row.names(foldFrame) <- NULL
  print.data.frame(foldFrame)
  
}
PhilipMostert/inlabruSDMs documentation built on April 14, 2025, 11:39 a.m.