# R/lemXvFun.R

#' @title Computes the likelihood cross-validation of the raster partitions
#'
#' @description The \code{lemXv} function computes likelihood cross-validation scores for the observed case data at each of the input bandwidths used for the smoothing matrix. The cross-validation test and training datasets are generated from the observed cases by k-fold sampling without replacement.
#'
#' @param cases Spatial polygons with case data
#' @param population Spatial polygons with population data
#' @param cellsCoarse Minimum resolution for rasterization of case data for numerical accuracy of smoothing matrix
#' @param cellsFine Minimum resolution for rasterization of population data for numerical integration of smoothing matrix
#' @param bw Vector of bandwidths
#' @param fact Aggregation factor prior to 'final step' smoothing (set to zero to skip final step)
#' @param xv Number of cross-validation datasets
#' @param lemObjects List of arrays for the smoothing matrix, and raster stacks for the partition and smoothed offsets
#' @param ncores Number of cores/threads for parallel processing
#' @param iterations List of convergence tolerance, number of iterations, and use of gpuR package for running local-EM recursions
#' @param randomSeed Seed for random number generator
#' @param path Folder for storing rasters
#' @param filename Filename (must have .grd extension) of the risk estimation
#' @param verbose Verbose output
#'
#' @details If a smoothing matrix is not supplied via \code{lemObjects}, the \code{lemXv} function creates a raster stack containing the IDs of the partitions by overlaying the spatial polygons of the case and population data. The smoothed offsets and smoothing matrix are then computed for the specified bandwidths of each cross-validation set.
#'
#' @return The \code{lemXv} function returns a data frame of the specified bandwidths and their cross-validation scores and, if \code{fact > 0}, a raster of the risk estimation for the bandwidth with the lowest cross-validation score.
#'
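#' @examples
#' \dontrun{
#' # illustrative sketch only: 'kCases' and 'kPop' are hypothetical
#' # SpatialPolygonsDataFrame objects; case counts are in columns whose
#' # names start with 'count' or 'cases'
#' xvResult = lemXv(
#'     cases = kCases,
#'     population = kPop,
#'     cellsCoarse = 8,
#'     cellsFine = 100,
#'     bw = c(10, 15, 20) * 1000,
#'     ncores = 2,
#'     path = tempdir())
#'
#' xvResult$xv       # bandwidths and their CV scores (minimum rescaled to 0)
#' xvResult$riskEst  # risk surface for the bandwidth with the lowest score
#' }
#'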
#' @import raster sp Matrix
#' @useDynLib localEM
#' @export
lemXv = function(
    cases,
    population,
    cellsCoarse,
    cellsFine,
    bw,
    fact = 1,
    xv = 4,
    lemObjects,
    ncores = 1,
    iterations = list(tol = 1e-5, maxIter = 1000, gpu = FALSE),
    randomSeed = NULL,
    path = getwd(),
    filename = 'riskXv.grd',
    verbose = FALSE
){

  dir.create(path, showWarnings = FALSE, recursive = TRUE)

  if(!length(grep("\\.gr[id]$", filename))) {
    warning("filename should have .grd extension")
  }

  # prepend the output folder unless filename already includes a path
  if(!length(grep('/', filename))) {
    riskFile = file.path(path, filename)
  } else {
    riskFile = filename
  }

  # warning messages
  if(missing(lemObjects)) {
    if(!inherits(cases, 'SpatialPolygonsDataFrame')) {
      stop("spatial polygons for case data must be provided if smoothing matrix not supplied")
    }

    # names of interest for regions in the coarse shapefile
    countcol = grep('^(count|cases)', names(cases), value=TRUE, ignore.case=TRUE)

    if(!length(countcol)) {
      stop("column names of case data must start with 'count' or 'cases'")
    }
  }

  # initializing parallel features (if specified)
  if(ncores > 1) {
    theCluster = parallel::makeCluster(spec=ncores, type='PSOCK', methods=TRUE)
    parallel::setDefaultCluster(theCluster)
    parallel::clusterEvalQ(theCluster, library('raster'))
  } else {
    theCluster = NULL
  }

  # for cross-validation scores
  if(!is.null(randomSeed)) {
    set.seed(randomSeed)
  }

  # parameter specifications (if not supplied)
  if(!length(iterations$tol)) iterations$tol = 1e-5
  if(!length(iterations$maxIter)) iterations$maxIter = 1000
  if(!length(iterations$gpu)) iterations$gpu = FALSE

  if(missing(lemObjects)) {
    if(verbose) {
      cat(date(), "\n")
      cat("computing smoothing matrix\n")
    }

    ##raster partition
    xvLemRaster = rasterPartition(
        polyCoarse = cases,
        polyFine = population,
        cellsCoarse = cellsCoarse,
        cellsFine = cellsFine,
        xv = xv,
        bw = bw,
        ncores = theCluster,
        path = path,
        idFile = 'idXv.grd',
        offsetFile = 'offsetXv.grd',
        verbose = verbose)

    # restart cluster
    if(ncores > 1 && !is.null(theCluster)) {
      parallel::stopCluster(theCluster)

      theCluster = parallel::makeCluster(spec=ncores, type='PSOCK', methods=TRUE)
      parallel::setDefaultCluster(theCluster)
      parallel::clusterEvalQ(theCluster, library('raster'))
    }
    # smoothing matrix
    xvSmoothMat =  smoothingMatrix(
        rasterObjects = xvLemRaster,
        ncores = theCluster,
        path = path,
        filename = 'smoothMatXv.grd',
        verbose = verbose)


    # save smoothing matrix, useful in case of failure later on
    saveRDS(xvSmoothMat, file = file.path(path, 'smoothingMatrix.rds'))
    # xvSmoothMat = readRDS(file.path(path, 'smoothingMatrix.rds'))
    xvMat = xvSmoothMat$xv

  } else {
    if(verbose) {
      cat(date(), "\n")
      cat("using supplied smoothing matrix\n")
    }

    xvSmoothMat = lemObjects
    xvMat = lemObjects$xv
  }


  # checking structure of case data
  if(is.vector(cases)) {
    cases = data.frame(cases=cases)
  }
  if(is.matrix(cases)) {
    cases = as.data.frame(cases)
  }
  if(inherits(cases, 'SpatialPolygonsDataFrame')) {
    polyCoarse = cases
    cases = cases@data
  } else {
    polyCoarse = NULL
  }

  #names of interest for regions in the coarse shapefile
  countcol = grep('^(count|cases)', names(cases), value=TRUE, ignore.case=TRUE)
  # if(!length(countcol)){
  # countcol = grep("^(id|name)", names(cases), invert=TRUE, value=TRUE)[1]
  # }

  if(verbose) {
    cat(date(), "\n")
    cat("running local-EM for validation sets\n")
  }
  # estimate risk (by partition, not continuous) for each bw/cv combination

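  # run finalLemIter once per bandwidth, in parallel when a cluster is
  # available; MoreArgs holds the arguments shared by every call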
  forMoreArgs = list(
      x=cases[,countcol, drop=FALSE],
      lemObjects = xvSmoothMat,
      tol = iterations$tol,
      maxIter = iterations$maxIter,
      gpu = iterations$gpu,
      verbose=verbose)

  if(!is.null(theCluster)) {
    estList = parallel::clusterMap(
        theCluster,
        finalLemIter,
        bw = xvSmoothMat$bw,
        MoreArgs = forMoreArgs,
        SIMPLIFY=FALSE
    )

  } else {
    estList = mapply(
        finalLemIter,
        bw = xvSmoothMat$bw,
        MoreArgs = forMoreArgs,
        SIMPLIFY=FALSE
    )
  }

  if(any(sapply(estList, inherits, 'try-error'))) {
    warning("errors in local-em estimation")
  }

  estListExp = try(lapply(estList, function(x) x$expected))

  estListRisk = lapply(estList, function(x) x$risk)

  estDf = as.matrix(do.call(cbind, estListExp))
  riskDf = as.matrix(do.call(cbind, estListRisk))
  colnames(estDf) = colnames(riskDf) = paste(
      rep(names(estList), unlist(lapply(estListExp, function(xx) dim(xx)[2]))),
      unlist(lapply(
              estListExp, colnames
          )), sep = '_')
  rownames(riskDf) = colnames(xvSmoothMat$regionMat)

  if(verbose) {
    cat("computing CV scores\n")
  }
  # compute the CV scores

  # expected counts in left out regions
  xvEst = estDf[,grep("xv[[:digit:]]+", colnames(estDf))]
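  # layer names follow the pattern bw<bandwidth>xv<fold>_<countColumn>,
  # e.g. 'bw10000xv2_count'; parse the fold, count column and bandwidth back out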
  #Sxv = gsub("^bw[[:digit:]]+xv|_(count|cases|count[[:digit:]]|cases[[:digit:]])+?$|_$", "", colnames(xvEst))
  Sxv = substr(colnames(xvEst), regexpr("xv", colnames(xvEst)) + 2, regexpr("_", colnames(xvEst)) - 1)
  Scount = gsub("bw[[:digit:]]+xv[[:digit:]]+_", "", colnames(xvEst))
  if(all(nchar(Scount)==0)) Scount = rep_len(countcol, length(Scount))
  Sbw = gsub('^bw|xv.*', "", colnames(xvEst))

  suppressMessages(xvEstMask <- xvMat[,Sxv] * xvEst)
  # observed counts in left out regions
  suppressMessages(xvObs <- xvMat[,Sxv] * as.matrix(cases[,Scount]))

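  # Poisson log-likelihood of the held-out observed counts given the
  # expected counts implied by the estimated risk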
  logProbCoarse = stats::dpois(as.matrix(xvObs), as.matrix(xvEstMask), log=TRUE)
  
  badVec = which(apply(logProbCoarse, 1, function(xx) any(is.na(xx))))
  if(length(badVec)) {
    warning("regions containing NA: ",
        paste(badVec, collapse = ', '),
        "; treating all NAs as 0")
  }
  # vectorized replacement of the NA entries
  logProbCoarse[is.na(logProbCoarse)] = 0
  
  
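  # dpois gives -Inf when the expected count is zero but cases were observed;
  # drop these terms so a single empty partition cannot dominate the score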
  logProbCoarse[is.infinite(logProbCoarse)] = 0
  logProb = apply(logProbCoarse, 2, sum)

  xvFullMat = list(
    logProb = logProbCoarse, obs = xvObs, exp = xvEstMask,
    xvMat = xvMat, Sxv = Sxv, xvEst = xvEst, cases = cases, Scount = Scount
  )
  
  logProbFull = data.frame(
      bw = as.numeric(Sbw),
      cases = Scount,
      fold = Sxv,
      minusLogProb = -logProb
  )

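  # sum fold scores within each bandwidth and count column, reshape to one
  # column per count variable, then subtract each column's minimum so the
  # best bandwidth scores zero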
  xvRes = stats::aggregate(
      logProbFull[,'minusLogProb'],
      as.list(logProbFull[,c('bw','cases')]),
      sum,
      na.rm = TRUE
  )
  xvRes = stats::reshape(
      xvRes, direction = 'wide',
      idvar = 'bw',
      timevar = 'cases'
  )
  colnames(xvRes) = gsub("^x[.]", "", colnames(xvRes))
  xvRes = xvRes[,c('bw',countcol)]
  minXvScore = apply(xvRes[,countcol, drop=FALSE],2,min, na.rm=TRUE)
  xvRes[,countcol] = xvRes[,countcol] -
      matrix(minXvScore, nrow=nrow(xvRes), ncol=length(minXvScore), byrow=TRUE)

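  # attach the partition-level risk estimates to the fine raster's attribute
  # table, matching rows by partition label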
  newDf <- riskDf[
      as.character(raster::levels(xvSmoothMat$rasterFine)[[1]]$partition),]

  riskRaster = xvSmoothMat$rasterFine
  levels(riskRaster)[[1]] =  as.data.frame(cbind(
          ID = raster::levels(xvSmoothMat$rasterFine)[[1]]$ID,
          newDf))

  result = list(
      xv = xvRes,
      xvFull = logProbFull,
      riskAll = riskRaster,
      smoothingMatrix = xvSmoothMat,
      expected = polyCoarse,
      folds = xvMat,
      extra = xvFullMat
  )

  # estimate continuous risk at high resolution (if specified)
  if(fact > 0) {

    if(verbose) {
      cat(date(), "\n")
      cat("final smoothing step for bw with lowest CV score\n")
    }

    bwMin = result$xv[apply(result$xv[,colnames(result$xv)[-1]],2,which.min),'bw']
    Slayers = paste('bw', format(bwMin,scientific = FALSE), '_', countcol, sep = '')

    result$riskEst = finalSmooth(
        x = result,
        Slayers = Slayers,
        fact = fact,
        filename = riskFile,
        ncores = theCluster
    )
#   names(result$riskEst) = Slayers

    result$bw = paste('bw', bwMin, sep = '')
  }

  # done with the cluster
  if(!is.null(theCluster))
    parallel::stopCluster(theCluster)

  if(verbose) {
    cat(date(), "\n")
    cat("done\n")
  }

  return(result)
}


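# lemXvMulti: likelihood cross-validation for multiple case maps sharing a
# study region. Each element of 'cases' and 'lemObjects' corresponds to one
# map; lemXv is run per map with fact = 0, the per-map fold likelihoods are
# pooled into a single set of CV scores, and the final smoothing step is run
# once on the pooled risk surface.
#
# illustrative usage sketch ('caseList' and 'lemList' are hypothetical lists
# of case polygons and precomputed smoothing objects, one element per map):
#   multiXv = lemXvMulti(cases = caseList, lemObjects = lemList,
#       ncores = 2, path = tempdir())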
#' @export
lemXvMulti = function(
  cases, 
  population, 
  cellsCoarse,
  cellsFine,
  bw,
  fact = 1,
  xv = 4,
  lemObjects, 
  ncores = 1, 
  iterations = list(tol = 1e-5, maxIter = 1000, gpu = FALSE), 
  randomSeed = NULL, 
  path = getwd(), 
  filename = 'riskXv.grd', 
  verbose = FALSE
){
  
  if(verbose) {
    cat(date(), '\n')
    cat('running local-EM for validation sets for all maps\n')
  }
  
  resList = list()
  lambdaList = list()
  for(inM in 1:length(cases)) {
    
    ## partition-level risk estimation
    if(is.null(randomSeed)) {
      randomSeed = inM
    }
    
    lemXvMap = lemXv(
      cases = cases[[inM]], 
      fact = 0, 
      lemObjects = lemObjects[[inM]], 
      ncores = ncores, 
      iterations = iterations, 
      randomSeed = randomSeed, 
      path = path, 
      filename = paste0(gsub('\\.grd$', '', filename), inM, '.grd'),
      verbose = verbose)
    
    resList[[inM]] = lemXvMap
    
    lambdaRasterMap = raster::deratify(lemXvMap$riskAll)
    names(lambdaRasterMap) = paste0(names(lambdaRasterMap), '_', inM)
    
    lambdaList[[inM]] = lambdaRasterMap
  }
  
  lambdaRasterStack = stack(lambdaList)
  lambdaRaster = raster::stackApply(lambdaRasterStack, 
                                    indices = gsub('_[[:digit:]]+', '', names(lambdaRasterStack)), 
                                    fun = 'sum', na.rm = TRUE)
  names(lambdaRaster) = gsub('index_', '', names(lambdaRaster))
  
  if(verbose) {
    cat(date(), '\n')
    cat('computing cross-validation scores\n')
  }
  
  xvList = list()
  logProbFull = NULL
  for(inM in 1:length(cases)) {
    
    ## risk estimation for cross-validation regions
    lemXvMap = resList[[inM]]
    
    xvMatMap = lemXvMap$smoothingMatrix$xv
    
    xvList[[inM]] = xvMatMap
    
    # Nxv = dim(xvMatMap)[2] / length(cases)
    
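    # fold columns are interleaved across maps: recover the folds belonging
    # to map inM from the column index modulo the number of maps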
    idMatMap = as.numeric(dimnames(xvMatMap)[[2]]) - 1
    # idMatMap = (idMatMap %/% Nxv) + 1
    idMatMap = (idMatMap %% length(cases)) + 1
    idMatMap = which(idMatMap == inM)
    
    regionMatMap = lemXvMap$smoothingMatrix$regionMat
    
    offsetMatMap = lemXvMap$smoothingMatrix$offsetMat[['offset']]
    offsetMatMap = offsetMatMap[colnames(regionMatMap), colnames(regionMatMap)]
    
    partitionAreasMatMap = Matrix::Diagonal(
      length(lemXvMap$smoothingMatrix$partitionAreas),
      lemXvMap$smoothingMatrix$partitionAreas)
    dimnames(partitionAreasMatMap) = list(
      names(lemXvMap$smoothingMatrix$partitionAreas),
      names(lemXvMap$smoothingMatrix$partitionAreas)
    )
    partitionAreasMatMap = partitionAreasMatMap[colnames(regionMatMap), colnames(regionMatMap)]
    
    riskRasterMap = stack(
      raster::deratify(lemXvMap$smoothingMatrix$rasterFine)[['partition']],
      lambdaRaster[[grep('xv', names(lambdaRaster))]])
    riskRasterMap = raster::mask(riskRasterMap, lemXvMap$smoothingMatrix$rasterFine)
    
    littleLambdaMap = as.data.frame(stats::na.omit(raster::unique(riskRasterMap)))
    
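    # after masking, a partition id can map to more than one row of risk
    # values; drop duplicated rows whose risks are all zero, then average
    # any remaining duplicates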
    duplLambdaMap = unique(littleLambdaMap$partition[duplicated(littleLambdaMap$partition)])
    duplLambdaMap = littleLambdaMap$partition %in% duplLambdaMap
    zeroLambdaMap = apply(littleLambdaMap[,grep('bw', names(littleLambdaMap))] == 0, 1, all)
    
    littleLambdaMap = littleLambdaMap[!(duplLambdaMap & zeroLambdaMap),]
    littleLambdaMap = aggregate(
      littleLambdaMap[,grep('bw', names(littleLambdaMap))],
      by = list(partition = littleLambdaMap$partition),
      FUN = 'mean', na.rm = TRUE)
    
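    # integrate the per-cell risk over each partition by scaling with the
    # partition areas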
    bigLambdaMap = partitionAreasMatMap %*%
      as.matrix(littleLambdaMap[,grep('bw', names(littleLambdaMap))])
    
    ## cross-validation scores    
    countcol = grep('^(count|cases)', names(cases[[inM]]), value = TRUE, ignore.case = TRUE)
    
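    # expected counts by coarse region: region membership x offsets x
    # integrated partition risk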
    expectedCoarseMap = as.matrix(regionMatMap %*% offsetMatMap %*% bigLambdaMap)
    
    obsCountsMap = getObsCounts(
      x = as.matrix(data.frame(cases[[inM]])[,countcol]), 
      polyCoarse = cases[[inM]], 
      regionMat = regionMatMap)
    obsCountsMap = matrix(obsCountsMap, 
                          nrow = nrow(expectedCoarseMap), ncol = ncol(expectedCoarseMap), 
                          dimnames = dimnames(expectedCoarseMap))
    
    Sxv = as.numeric(
      substr(colnames(obsCountsMap), 
             regexpr('xv', colnames(obsCountsMap)) + 2, regexpr('_', colnames(obsCountsMap)) - 1))
    
    expectedCoarseMap = expectedCoarseMap[,Sxv %in% idMatMap]
    obsCountsMap = obsCountsMap[,Sxv %in% idMatMap]
    
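    # zero out regions not held out in fold inX so only the left-out
    # regions contribute to the fold's score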
    for(inX in idMatMap) {
      
      expectedCoarseMap[!xvMatMap[,inX], grep(paste0('xv', inX), colnames(expectedCoarseMap))] = 0
      obsCountsMap[!xvMatMap[,inX], grep(paste0('xv', inX), colnames(obsCountsMap))] = 0
    }
    
    logProbCoarseMap = stats::dpois(as.matrix(obsCountsMap), 
                                    as.matrix(expectedCoarseMap), 
                                    log = TRUE)
    logProbCoarseMap[is.infinite(logProbCoarseMap)] = 0
    logProbMap = apply(logProbCoarseMap, 2, sum)
    
    Sbw = as.numeric(gsub('^bw|(xv[[:digit:]]+_[[:alnum:]]+$)', '', colnames(obsCountsMap)))
    Scount = gsub('bw[[:digit:]]+xv[[:digit:]]+_', '', colnames(obsCountsMap))
    Sxv = paste0(
      substr(colnames(obsCountsMap), 
             regexpr('xv', colnames(obsCountsMap)) + 2, regexpr('_', colnames(obsCountsMap)) - 1), 
      '_', inM)
    
    logProbFull = rbind(logProbFull, 
                        data.frame(bw = Sbw, cases = Scount, fold = Sxv, minusLogProb = -logProbMap))
  }
  rownames(logProbFull) = NULL
  
  xvRes = stats::aggregate(
    logProbFull[,'minusLogProb'],
    as.list(logProbFull[,c('bw','cases')]),
    sum,
    na.rm = TRUE)
  xvRes = stats::reshape(
    xvRes, direction = 'wide',
    idvar = 'bw',
    timevar = 'cases')
  
  colnames(xvRes) = gsub('^x[.]', '', colnames(xvRes))
  xvRes = xvRes[,c('bw',countcol)]
  minXvScore = apply(xvRes[,countcol, drop=FALSE], 2, min, na.rm = TRUE)
  xvRes[,countcol] = xvRes[,countcol] -
    matrix(minXvScore, nrow = nrow(xvRes), ncol = length(minXvScore), byrow = TRUE)
  
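  # final risk surface: drop the xv layers and mask to cells valid in
  # every map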
  lambdaRasterFinal = lambdaRaster[[grep('xv', names(lambdaRaster), invert = TRUE)]]
  for(inM in 1:length(cases)) {
    
    lambdaRasterMask = lambdaList[[inM]][[1]]
    
    lambdaRasterFinal = raster::mask(lambdaRasterFinal, lambdaRasterMask)
  }
  
  result = list(
    xv = xvRes, 
    xvFull = logProbFull, 
    riskAll = lambdaRasterFinal, 
    smoothingMatrix = resList, 
    folds = xvList)
  
  # estimate continuous risk at high resolution (if specified)
  if(fact > 0) {
    
    if(verbose) {
      cat(date(), '\n')
      cat('final smoothing step for bw with lowest CV score\n')
    }
    
    bwMin = result$xv[apply(result$xv[,colnames(result$xv)[-1]], 2, which.min),'bw']
    Slayers = paste('bw', format(bwMin, scientific = FALSE), '_', countcol, sep = '')
    riskFile = file.path(path, filename)
    
    result$riskEst = finalSmoothMulti(
      x = result$riskAll,
      focalMat = result$smoothingMatrix[[1]]$smoothingMatrix$focal$focal,
      Slayers = Slayers,
      fact = fact,
      filename = riskFile,
      ncores = ncores)
    
    result$bw = paste('bw', bwMin, sep = '')
  }
  
  if(verbose) {
    cat(date(), '\n')
    cat('done\n')
  }
  
  return(result)
}
