In localEM: Nonparametric Modelling of Spatial Risk for Areal Disease Data

'
    %\VignetteEngine{knitr::docco_linear} 
    %\VignetteIndexEntry{Local-EM Example with Kentucky}
    '
#devtools::load_all("../pkg/localem")
#knitr::knit("../pkg/localem/inst/doc/kentucky.Rmd", 'kentucky.md')

knitr::opts_chunk$set(
    dev = 'png',
    fig.width = 4, fig.height = 2.5,
    dpi = 100, out.width = '45%', fig.ncol = 2)

# high resolution 
# either give R command argument highRes
# or have highRes variable defined as TRUE
if(!exists('highRes'))
  highRes = any(commandArgs() == 'highRes')

if(requireNamespace('Pmisc', quietly = TRUE)) {
  file.copy(system.file('src','webpage.css', package='Pmisc')
        , 'webpage.css')
  if(highRes) {
    cat(Pmisc::markdownHeader(
            title = 'High Resolution Local-EM Example with Kentucky', 
            author = 'Patrick Brown, Paul Nguyen',
            css = 'webpage.css'))
  } else {
    cat(Pmisc::markdownHeader(
            title = 'Local-EM Example with Kentucky', 
            author = 'Patrick Brown, Paul Nguyen',
            css = 'webpage.css'))
  }
  knitr::knit_hooks$set(plot = Pmisc::hook_plot_mdsubfig)
  theWidth = '45%'
} else {
  knitr::knit_hooks$set(plot = knitr::hook_plot_html)
  cat('\n# Local-EM Example with Kentucky\n')
  theWidth = '80%'
}

Introduction

The localEM package contains functions to implement the kernel smoothing local-EM algorithm$^1$ of disease data aggregated to geographical regions. This algorithm provides an nonparametric alternative to the standard geospatial models, such as the Besag-York-Mollie (BYM) model$^2$, for estimating spatial risk of areal disease data. With disease cases typically aggregated to highly coarse geographical regions (e.g., census counties, or census subdivisions), the local-EM method creates a tessellation of distinct regions by overlaying the map of these coarse regions with another map containing fine geographical regions (e.g., census tracts, census blocks, or census dissemination areas) of population data. This allows for the spatial risk to be estimated at a better resolution with the fine regions.

The methodology of this package is demonstrated on simulated lung cancer cases for the state of Kentucky, USA. The spatial polygons for the census counties and tracts of Kentucky are included with this package.

ncores = 12
cellsFine = 500
cellsCoarse = 20
nsim = 12
nxv = 6
fact = 3
Sbw = unique(round(
  exp(seq(log(4), log(35), len=15)) 
  )) * 1000
threshold = c(1.1, 1.25, 1.5)
Nboot = 100 
path = 'highResLocalem/'
cacheDir = 'highResCache/'
figDir = 'highResFigure/'
set.seed(100)

# specify number of grid cells and number of cores for computations in parallel
ncores = 2
cellsFine = 80
cellsCoarse = 8
nsim = 4
nxv = 4
fact = 2
Sbw = seq(10, 35, by = 5) * 1000
threshold = c(1, 1.5)
Nboot = 20 
path = 'lowResLocalem/'
cacheDir = 'lowResCache/'
figDir = 'lowResFigure/'
set.seed(100)

specify number of grid cells for simulation

if(requireNamespace('RandomFields', quietly = TRUE)) {
  cellsSimulate = 200
} else {
  cellsSimulate = 100  
}

require('mapmisc', quietly=TRUE)
require('rgdal', quietly=TRUE)

data('kentuckyCounty', package = 'localEM') 
data('kentuckyTract', package = 'localEM') 
data('kMap', package = 'localEM')

if(exists('persistentCache', 'package:mapmisc')) {   
  mapmisc::persistentCache() 
}
dir.create(path, showWarnings = FALSE, recursive = TRUE)
dir.create(cacheDir, showWarnings = FALSE, recursive = TRUE)
dir.create(figDir, showWarnings = FALSE, recursive = TRUE)
knitr::opts_chunk$set(fig.path=figDir, cache.path=cacheDir)

kMap = mapmisc::tonerToTrans(
  mapmisc::openmap(kentuckyCounty, fact=1.6, path='stamen-toner'))

Simulate Cases

Using the simLgcp() function from the geostatsp package, case locations are simulated with the log Gaussian Cox process (LGCP) and following parameters:

mean: 0
variance: 0.16
shape: 2
range: 120 km
offsets: log of expected cases/m$^2$ of the census tracts

kentuckyOffset = geostatsp::spdfToBrick(
    kentuckyTract,
    geostatsp::squareRaster(kentuckyTract, cellsSimulate),
    pattern = '^expected$',
    logSumExpected = TRUE)

set.seed(0)
kCases = geostatsp::simLgcp(
    param = c(mean = 0, variance = 0.4^2, range = 120 * 1000, shape = 2),
    covariates = list(logExpected = kentuckyOffset), 
    offset = 'logExpected', n = nsim)

The simulated cases are then aggregated to the appropriate counties. Plots of the relative intensity, event locations and aggregated data are provided for the first simulated dataset.

kCases$agg = lapply(
    kCases[grep('^events[[:digit:]]+?', names(kCases))],
    function(qq) over(qq, kentuckyCounty)[,'id']
)

countyCounts = as.data.frame(lapply(
        kCases$agg,  
        function(xx) {
          as.vector(table(xx, exclude = NULL)[
                  as.character(kentuckyCounty$id)])
        }
    ))

countyCounts[is.na(countyCounts)] = 0
names(countyCounts) = gsub('^events', 'count', names(countyCounts))
rownames(countyCounts) = as.character(kentuckyCounty$id)
kentuckyCounty = merge(kentuckyCounty, countyCounts, by.x = 'id', by.y = 'row.names')

temp = aggregate(list(logExpected = kentuckyTract$expected), list(id = kentuckyTract$id2),FUN = function(x){log(sum(x))})
kentuckyCounty = merge(kentuckyCounty, temp) #for plot
kentuckyCounty$expected = exp(kentuckyCounty$logExpected)

# offset map
oCol = colourScale(
    breaks=c(0, 0.01, 0.02, 0.05, 0.1, 1,4), 
    style = 'fixed',
    col = 'YlOrRd')

map.new(kentuckyTract)#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Offsets')
plot(kentuckyOffset, 
    col = oCol$col, 
    breaks = pmax(-100, log(oCol$breaks)) - 6*log(10), 
    legend = FALSE, 
    add = TRUE)
plot(kMap, add = TRUE, maxpixels=10^7)
legendBreaks('topleft', 
    breaks = oCol$breaks, col = oCol$col, 
    title = expression(plain('cases/km')^2), 
    bg = 'white')


# simulated risk map
iCol = colourScale(
    kCases$raster$relativeIntensity1, 
    breaks = 8, style = 'equal', dec = -log10(0.5))

map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Simulated Risk'
)
plot(kCases$raster$relativeIntensity1, 
    col = iCol$col, breaks = iCol$breaks, 
    legend = FALSE, 
    add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', 
    col = iCol$col, breaks = iCol$breaks, 
    title = 'RR', 
    bg = 'white')


# simulated events map
map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Simulated Events'
)
plot(kMap, add = TRUE)
points(kCases$events1, col = '#FF000030', pch = 20, cex = 0.5)
scaleBar(kentuckyCounty, 
    pos = 'topleft', 
    bg = 'white')


# simulated counts
cCol = colourScale(
    kentuckyCounty$count1, 
    breaks = 10, style = 'quantile', dec = -1)

map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Aggregated Counts'
)
plot(kentuckyCounty, col = cCol$plot, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', cCol)
scaleBar(kentuckyCounty, 
    pos = 'topleft', 
    inset = c(0.2, 0), 
    bg = 'white')


# offset by county
cCol = colourScale(
    kentuckyCounty$expected, 
    breaks = 10, style = 'quantile', dec = -1)

map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Aggregated Counts'
)
plot(kentuckyCounty, col = cCol$plot, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', cCol)
scaleBar(kentuckyCounty, 
    pos = 'topleft', 
    inset = c(0.2, 0), 
    bg = 'white')

#SIR
kentuckyCounty$SIR1 = kentuckyCounty$count1/kentuckyCounty$expected

cCol = colourScale(
    kentuckyCounty$SIR1, 
    breaks = 8, style = 'equal', dec = -log10(0.5))

map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Aggregated Counts'
)
plot(kentuckyCounty, col = cCol$plot, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', cCol)
scaleBar(kentuckyCounty, 
    pos = 'topleft', 
    inset = c(0.2, 0), 
    bg = 'white')

Cross-validation

The local-EM algorithm requires a smoothing parameter called the bandwidth to estimate the spatial risk. Small values of the bandwidth yield estimates similar to standardized incidence ratios of each areal regions, while large values yield estimates to the overall mean incidence ratio of the entire study area (i.e., [total counts]/[total offsets]). The preferred or optimal bandwidth for the disease data is one that minimizes the trade-off between the bias and variance of the estimator.

To automatic the selection of the optimal bandwidth, the lemXv() function of this package implements a likelihood cross-validation (CV) approach with the set of specified bandwidths. CV scores are computed with $k$-fold sampling without replacement of the dataset. The optimal bandwidth is the one that yields the smallest CV score.

The CV scores with 4-fold sampling are provided for the first simulated dataset.

library('localEM')

fileHere = file.path(path, 'xvKentucky.rds')

if(!file.exists(fileHere)) {

  xvKentucky = lemXv(
      cases = kentuckyCounty[,c('id','count1')], 
      population = kentuckyTract, 
      cellsCoarse = cellsCoarse,  
      cellsFine = cellsFine, 
      bw = Sbw, 
      xv = nxv, 
      ncores = ncores, 
      path = path, 
      verbose = TRUE)

  saveRDS(xvKentucky, file = fileHere)
} else {
  xvKentucky = readRDS(fileHere)
}

plot(xvKentucky$xv[,1]/1000, xvKentucky$xv[,2], 
    xlab = 'km', ylab = '-log p', 
    type = 'o', xlim = c(9, 40), ylim = c(0, 20))

Risk Estimation

The R objects created from the lemXv() function also contain the local-EM risk estimation done with the optimal bandwidth found in the CV approach. High-resolution plots of the local-EM estimation with their optimal bandwidths are provided for all simulated datasets.

The riskEst() function allows local-EM risk estimation to be done with specified bandwidths. Plots of the local-EM estimation with all bandwidths used in this example are provided for the first simulated dataset. The plots show high cancer risk in the areas located east of Bowling Green and south of Frankfurt decreasing as the bandwidth increases.

riskKentucky = riskEst(
    cases = kentuckyCounty[,c('id','count1')], 
    lemObjects = xvKentucky$smoothingMatrix, 
    bw = Sbw,
    ncores = ncores,
    path = cacheDir)

# estimated risk maps
toPlot = brick(filename(riskKentucky$riskEst))[[
  round(seq(1, nlayers(riskKentucky$riskEst), len=6))
  ]]
SbwShort = as.numeric(gsub("^bw|_[[:alnum:]]+$", "", names(toPlot)))

rCol = colourScale(
    c(0, maxValue(toPlot)), 
    breaks = 9, style = 'equal', 
    dec = -log10(0.5))

for(inR in 1:nlayers(toPlot)) {
  map.new(kentuckyCounty)#, 
#      mar = c(0, 0, 1, 0), 
#      main = paste('Simulation 1: Estimated Risk, bw=', Sbw[inR] / 1000, ' km', sep = ''))
  plot(toPlot[[inR]], 
      col = rCol$col, breaks = rCol$breaks, 
      legend = FALSE, 
      add = TRUE)
  plot(kMap, add = TRUE, maxpixels=10^7)
  legendBreaks('topleft', rCol, bg = 'white')
}

Uncertainty Estimation

To measure the uncertainty of the local-EM algorithm, the excProb() function computes the exceedance probabilities with the same bandwidth parameter used in the risk estimation. Bootstrapping from a Poisson process is used to simulate the events for calculating these exceedance probabilities.

Specifically, under the assumption that disease events are a realisation from the background population and constant risk threshold, cases are bootstrapped and randomly aggregated to the areal regions. Using the same bandwidth as the observed data, the local-EM risk is then estimated for each of the bootstrapped data. Afterwards, exceedance probabilities are computed as the proportion of the observed risk estimate at least as large as the ones of the bootstrap data. Large exceedance probabilities are consistent with the risk being greater than the specified threshold.

excProbKentucky = excProb(
    lemObjects = xvKentucky, 
    threshold = threshold, 
    Nboot = Nboot, 
    fact = 2,
    ncores = ncores,
    path = path,
    verbose=TRUE)

toPlot = excProbKentucky$excProb

pCol = colourScale(
    toPlot, 
    breaks = c(0, 0.2, 0.8, 0.95, 1), style = 'fixed', 
    col = c('green', 'yellow', 'orange', 'red'))                                

for(inT in 1:nlayers(toPlot)) {

  map.new(kentuckyCounty)#, 
#      mar = c(0, 0, 1, 0), 
#      main = paste('Simulation 1: Exceedance Probabilities, t=', threshold[inT], sep = ''))
  plot(toPlot[[inT]], 
      col = pCol$col, breaks = pCol$breaks, 
      legend = FALSE, 
      add = TRUE)
  plot(kMap, add = TRUE, maxpixels=10^7)
}
legendBreaks('topleft', pCol, bg = 'white')


map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Simulated Risk'
)
plot(kCases$raster$relativeIntensity1, 
    col = iCol$col, breaks = iCol$breaks, 
    legend = FALSE, 
    add = TRUE)
plot(kMap, add = TRUE, maxpixels=10^7)
legendBreaks('topleft', 
    col = iCol$col, breaks = iCol$breaks, 
    title = 'RR', 
    bg = 'white')

map.new(kentuckyCounty)
plot(
    xvKentucky$riskEst,
    col = iCol$col, breaks = iCol$breaks, 
    legend = FALSE, 
    add = TRUE)
plot(kMap, add = TRUE, maxpixels=10^7)
legendBreaks('topleft', iCol, bg = 'white')

Multiple datasets

Using the R objects created from the lemXv() function for the first simulated dataset, this CV method can be efficiently implemented on the remaining simulated datasets.

xvAllKentucky = lemXv(
    cases = kentuckyCounty@data[,
        grep('^count[[:digit:]]', names(kentuckyCounty))], 
    lemObjects = xvKentucky$smoothingMatrix,
    ncores = ncores,
    path = cacheDir, verbose=TRUE)

maxY = min(max(xvAllKentucky$xv[,-1]), 30)

Scol = RColorBrewer::brewer.pal(4, "Dark2")
ScolFull = c(Scol,
    rep(mapmisc::col2html('grey', 0.5), 
        nrow(xvAllKentucky$xv) - length(Scol) )
)

matplot(xvAllKentucky$xv[,'bw']/1000, xvAllKentucky$xv[,-1], 
    main = 'CV Scores for All Simulations', 
    xlab = 'km', ylab = '-log p', 
    type = 'l', lty = 1, 
    col = ScolFull,
    ylim = c(0, 12), log='x')

legend('topright', paste('Simulation ', 1:4, sep = ''), 
    lty = 1, col = Scol, 
    cex = 0.6)

toPlot = brick(filename(xvAllKentucky$riskEst))

rCol = colourScale(
    breaks = c(seq(0,3, by=0.5), 4),
    style = 'fixed')

#bw = as.numeric(gsub('^bw', '', xvAllKentucky$bw))

for(inR in 1:nlayers(toPlot)) {

  map.new(kentuckyCounty)#, 
#      mar = c(0, 0, 1, 0), 
#      main = paste('Simulation ', inR, ': Estimated Risk, bw=', bw[inR] / 1000, ' km', sep = ''))
  plot(toPlot[[inR]], 
      col = rCol$col, breaks = rCol$breaks, 
      legend = FALSE, 
      add = TRUE)
  plot(kMap, add = TRUE)
  legendBreaks('topleft', rCol, bg = 'white')

map.new(kentuckyTract#, 
#    mar = c(0, 0, 1, 0), 
#    main = 'Simulated Risk'
)

plot(kCases$raster[[paste('relativeIntensity',inR, sep='')]], 
    col = rCol$col, breaks = rCol$breaks, 
    legend = FALSE, 
    add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', 
    col = rCol$col, breaks = rCol$breaks, 
    title = 'RR', 
    bg = 'white')
}

Compute exceedance probabilities for both LocalEM and BYM model:

library(INLA)
cat('Computing risk estimation and exceedance probabilities using BYM model...', '\n')
theBymRiskList = theBymExcProbList = as.list(1:nsim)
names(theBymRiskList) = names(theBymExcProbList) = paste("count",1:nsim,sep="")

  for (i in 1:nsim){
    cat("Fitting BYM model for simulation:", i,"...\n")
    formula = as.formula(paste('count', i, ' ~ offset(logExpected)', sep = ''))
    kBYM = diseasemapping::bym(formula, data = kentuckyCounty,priorCI = list(sd = c(1, 0.05), propSpatial = c(0.2, 0.95)))

    #raster bym estimate
    kBYMRaster = raster::rasterize(kBYM$data, y = toPlot[[i]], field = 'fitted.exp')
    names(kBYMRaster) = 'fitted.exp'
    theBymRiskList[[i]] = kBYMRaster

    #compute exceedance probs
    theBymMarg =kBYM$inla$marginals.fitted.bym
    for(inT in 1:length(threshold)) {

      kBYM$data$excProb = geostatsp::excProb(theBymMarg, threshold = log(threshold[inT]))
      theBymExcProb = raster::rasterize( kBYM$data, y = toPlot[[i]], field = 'excProb')

      if(inT == 1) {
        bymExcProbRaster = theBymExcProb
      } else {
        bymExcProbRaster = raster::addLayer(bymExcProbRaster, theBymExcProb)
      }
    }
    names(bymExcProbRaster) = paste('count', i, '_threshold', threshold, sep = '')
    theBymExcProbList[[i]] = bymExcProbRaster

}

##output results to external file
myBymRisk = raster::writeRaster(
  raster::stack(theBymRiskList), 
  filename = file.path(path, 'bymRiskR.grd'), 
  overwrite = file.exists(file.path(path, 'bymRiskR.grd')))
myBymExcProb = raster::writeRaster(
  raster::stack(theBymExcProbList), 
  filename = file.path(path, 'bymExcProbR.grd'), 
  overwrite = file.exists(file.path(path, 'bymExcProbR.grd')))


#Compute excProb for localEM

theFileExcProb = file.path(path, paste('lemExcProbR.grd', sep = ''))



if(!file.exists(theFileExcProb)) {

  myLemXv = xvAllKentucky$xv
  myLemRisk = xvAllKentucky$riskEst


  for(inS in 1:nsim) {

    bw = xvAllKentucky$bw[inS]
    xvAllKentucky$riskEst = myLemRisk[[inS]]

    cat('Computing localEM exceedance probabilities for simulation:',inS, '\n')
    excProb = localEM::excProb(
        lemObjects =  xvAllKentucky,
        threshold = threshold,
        Nboot = Nboot,
        bw = bw,
        fact = fact,
        ncores = min(ncores, 2),
        iterations = list(tol = 1e-7, maxIter = 3000, gpu = FALSE),
        path = path,
        filename = paste('lemExcProbR', inS, '.grd', sep = ''),
        verbose = TRUE)

    theLemExcProb = excProb$excProb

    if(inS == 1) {
    theLemExcProbStack = theLemExcProb * 1
    } else {
     theLemExcProbStack = raster::addLayer(theLemExcProbStack, theLemExcProb * 1)
    }
  }

  myLemExcProb = raster::writeRaster(
    theLemExcProbStack, 
    filename = theFileExcProb,
    overwrite = file.exists(theFileExcProb))

} else {
   myLemExcProb = stack(theFileExcProb)
}

Compute ROC curves for both LocalEM and BYM model:

theSpec = seq(0, 1, by = 1/Nboot)

myLemRoc = list()
myBymRoc = list()
mySimSurface = kCases$raster[[grep("^relativeIntensity[[:digit:]]", names(kCases$raster))]]

  for(inS in 1:nsim) {

    ##simulation surface
    theSimSurface = mySimSurface[[inS]]
    names(theSimSurface) = 'relativeIntensity'

    ##ROC from local-EM analysis
    cat("Computing ROC curves for simulation:",inS,"...\n")

    cat("localEM...\n")
    lemExcProb = myLemExcProb[[grep(paste('(count|case)', inS, '_', sep = ''), names(myLemExcProb))]]
    names(lemExcProb) = gsub('^bw[[:digit:]]+\\_(count|case)[[:digit:]]+\\_', '', names(lemExcProb))
    names(lemExcProb) = gsub('^threshold', 'threshold.', names(lemExcProb))


    lemRoc = try(
      geostatsp::spatialRoc(
        fit = lemExcProb, 
        rr = threshold, 
        truth = theSimSurface, 
        random = FALSE, 
        spec = theSpec), 
      silent = TRUE)

    if(class(lemRoc) != 'try-error') {
      myLemRoc[[inS]] = lemRoc[,]
    }


    cat("BYM...\n")
    ##ROC from BYM analysis
    bymExcProb = myBymExcProb[[grep(paste('(count|case)', inS, '_', sep = ''), names(myBymExcProb))]]
    names(bymExcProb) = gsub('^(count|case)[[:digit:]]+\\_', '', names(bymExcProb))
    names(bymExcProb) = gsub('^threshold', 'threshold.', names(bymExcProb))

    bymRoc = try(
      geostatsp::spatialRoc(
        fit = bymExcProb, 
        rr = threshold, 
        truth = theSimSurface, 
        random = FALSE, 
        spec = theSpec), 
      silent = TRUE)

    if(class(bymRoc) != 'try-error') {
      myBymRoc[[inS]] = bymRoc[,]
    }
  }    

save(myLemRoc, myBymRoc, file = paste(path,"rocEstRKentucky.RData", sep = "")) 


cat(date(), '\n')
cat('done', '\n')



require(ggplot2)
###Plot
roc2Plot = data.frame()
for (i in 1:length(myLemRoc)){
   rocP = data.frame(rbind(myLemRoc[[i]], myBymRoc[[i]]))
  rocP$model = c(rep("LEM",length(theSpec)),rep("BYM",length(theSpec)))
  rocPlong = reshape(rocP,varying = list(grep("^X",names(rocP))),idvar = c("onemspec", "model"), direction = "long", 
                   v.names = "Sensitivity",timevar = "threshold",times =as.character(gsub("^X","",names(rocP)[grep("^X",names(rocP))])))
  rocPlong$sim = i
  roc2Plot = rbind(roc2Plot,rocPlong)
}

roc2Plot = aggregate(Sensitivity ~ onemspec + model + threshold, data = roc2Plot,mean,na.rm = T)

p = ggplot(data = roc2Plot,aes(x = onemspec, y = Sensitivity, color = model, linetype = threshold, group = interaction(model, threshold)))  +   geom_line() + xlab("1-Specificity")
print(p)

References

Nguyen P, Brown PE, Stafford J. Mapping cancer risk in southwestern Ontario with changing census boundaries. Biometrics. 2012; 68(4): 1229-37.
Besag J, York J, Mollie A. Bayesian image restoration, with two applications in spatial statistics. Ann Inst Statist Math. 1991; 43(1): 1-59.