' %\VignetteEngine{knitr::docco_linear} %\VignetteIndexEntry{Local-EM Example with Kentucky} ' #devtools::load_all("../pkg/localem") #knitr::knit("../pkg/localem/inst/doc/kentucky.Rmd", 'kentucky.md')
# Chunk defaults: small PNG figures, two per row.
knitr::opts_chunk$set( dev = 'png',
  fig.width = 4, fig.height = 2.5,
  dpi = 100,
  out.width = '45%',
  fig.ncol = 2)
# high resolution
# either give R command argument highRes
# or have highRes variable defined as TRUE
if(!exists('highRes')) highRes = any(commandArgs() == 'highRes')
if(requireNamespace('Pmisc', quietly = TRUE)) {
  # Pmisc is available: copy its stylesheet next to the output and emit
  # a markdown header whose title reflects the resolution chosen above.
  file.copy(system.file('src','webpage.css', package='Pmisc')
    , 'webpage.css')
  if(highRes) {
    cat(Pmisc::markdownHeader(
      title = 'High Resolution Local-EM Example with Kentucky',
      author = 'Patrick Brown, Paul Nguyen',
      css = 'webpage.css'))
  } else {
    cat(Pmisc::markdownHeader(
      title = 'Local-EM Example with Kentucky',
      author = 'Patrick Brown, Paul Nguyen',
      css = 'webpage.css'))
  }
  # render plots as markdown sub-figures
  knitr::knit_hooks$set(plot = Pmisc::hook_plot_mdsubfig)
  theWidth = '45%'
} else {
  # fallback: plain HTML plot hook and a literal markdown title
  knitr::knit_hooks$set(plot = knitr::hook_plot_html)
  cat('\n# Local-EM Example with Kentucky\n')
  theWidth = '80%'
}
The localEM
package contains functions to implement the kernel smoothing local-EM algorithm$^1$ for disease data aggregated to geographical regions. This algorithm provides a nonparametric alternative to the standard geospatial models, such as the Besag-York-Mollie (BYM) model$^2$, for estimating spatial risk of areal disease data. With disease cases typically aggregated to highly coarse geographical regions (e.g., census counties, or census subdivisions), the local-EM method creates a tessellation of distinct regions by overlaying the map of these coarse regions with another map containing fine geographical regions (e.g., census tracts, census blocks, or census dissemination areas) of population data. This allows for the spatial risk to be estimated at the finer resolution of these regions.
The methodology of this package is demonstrated on simulated lung cancer cases for the state of Kentucky, USA. The spatial polygons for the census counties and tracts of Kentucky are included with this package.
# Analysis parameters, high-resolution variant (used when highRes is TRUE).
ncores = 12       # cores for parallel computation
cellsFine = 500   # fine-grid cell count for risk estimation
cellsCoarse = 20  # coarse-grid cell count
nsim = 12         # number of simulated datasets
nxv = 6           # cross-validation folds
fact = 3          # aggregation factor for exceedance probabilities
# candidate bandwidths in metres: 15 log-spaced values from 4km to 35km
Sbw = unique(round(
  exp(seq(log(4), log(35), len=15))
)) * 1000
threshold = c(1.1, 1.25, 1.5)  # relative-risk thresholds for exceedance
Nboot = 100                    # bootstrap replicates
path = 'highResLocalem/'       # output directory
cacheDir = 'highResCache/'     # knitr cache directory
figDir = 'highResFigure/'      # figure directory
set.seed(100)
# specify number of grid cells and number of cores for computations in parallel
# (low-resolution variant: faster defaults for routine vignette builds)
ncores = 2       # cores for parallel computation
cellsFine = 80   # fine-grid cell count for risk estimation
cellsCoarse = 8  # coarse-grid cell count
nsim = 4         # number of simulated datasets
nxv = 4          # cross-validation folds
fact = 2         # aggregation factor for exceedance probabilities
# candidate bandwidths in metres: 10km to 35km in 5km steps
Sbw = seq(10, 35, by = 5) * 1000
threshold = c(1, 1.5)       # relative-risk thresholds for exceedance
Nboot = 20                  # bootstrap replicates
path = 'lowResLocalem/'     # output directory
cacheDir = 'lowResCache/'   # knitr cache directory
figDir = 'lowResFigure/'    # figure directory
set.seed(100)
specify number of grid cells for simulation
# Use a finer simulation grid when the RandomFields package is available,
# since geostatsp can then simulate the Gaussian field more accurately.
cellsSimulate = if (requireNamespace('RandomFields', quietly = TRUE)) 200 else 100
# Load mapping packages and the example datasets shipped with localEM:
# kentuckyCounty (coarse census counties), kentuckyTract (fine census
# tracts carrying expected counts), and kMap (background map tiles).
require('mapmisc', quietly=TRUE)
require('rgdal', quietly=TRUE)
data('kentuckyCounty', package = 'localEM')
data('kentuckyTract', package = 'localEM')
data('kMap', package = 'localEM')
# Enable mapmisc's persistent tile cache when this version provides it,
# create the output/cache/figure directories, and point knitr at them.
if(exists('persistentCache', 'package:mapmisc')) {
  mapmisc::persistentCache()
}
dir.create(path, showWarnings = FALSE, recursive = TRUE)
dir.create(cacheDir, showWarnings = FALSE, recursive = TRUE)
dir.create(figDir, showWarnings = FALSE, recursive = TRUE)
knitr::opts_chunk$set(fig.path=figDir, cache.path=cacheDir)
# Download a Stamen toner basemap covering the counties (network access)
# and make its white background transparent so it can be overplotted.
kMap = mapmisc::tonerToTrans(
  mapmisc::openmap(kentuckyCounty, fact=1.6, path='stamen-toner'))
Using the simLgcp()
function from the geostatsp
package, case locations are simulated with the log Gaussian Cox process (LGCP) and the following parameters:
# Rasterize the tract-level expected counts onto the simulation grid
# (log scale), then simulate nsim LGCP case-location datasets using the
# stated Matern parameters with the log expected counts as offset.
kentuckyOffset = geostatsp::spdfToBrick(
  kentuckyTract,
  geostatsp::squareRaster(kentuckyTract, cellsSimulate),
  pattern = '^expected$',
  logSumExpected = TRUE)
# fixed seed so the simulated datasets are reproducible
set.seed(0)
kCases = geostatsp::simLgcp(
  param = c(mean = 0, variance = 0.4^2, range = 120 * 1000, shape = 2),
  covariates = list(logExpected = kentuckyOffset),
  offset = 'logExpected',
  n = nsim)
The simulated cases are then aggregated to the appropriate counties. Plots of the relative intensity, event locations and aggregated data are provided for the first simulated dataset.
# Assign each simulated event to the county it falls in (sp::over),
# one list element per simulation.
kCases$agg = lapply(
  kCases[grep('^events[[:digit:]]+?', names(kCases))],
  function(qq) over(qq, kentuckyCounty)[,'id']
)
# Tabulate events per county for each simulation; indexing the table by
# all county ids yields NA for counties with no events, zeroed below.
countyCounts = as.data.frame(lapply(
  kCases$agg,
  function(xx) {
    as.vector(table(xx, exclude = NULL)[
      as.character(kentuckyCounty$id)])
  }
))
countyCounts[is.na(countyCounts)] = 0
names(countyCounts) = gsub('^events', 'count', names(countyCounts))
rownames(countyCounts) = as.character(kentuckyCounty$id)
# Attach the count columns to the county polygons.
kentuckyCounty = merge(kentuckyCounty, countyCounts, by.x = 'id', by.y = 'row.names')
# County-level log expected counts, summed over the tracts in each
# county (id2 links tracts to counties).
temp = aggregate(list(logExpected = kentuckyTract$expected),
  list(id = kentuckyTract$id2),FUN = function(x){log(sum(x))})
kentuckyCounty = merge(kentuckyCounty, temp)
#for plot
kentuckyCounty$expected = exp(kentuckyCounty$logExpected)
# offset map
oCol = colourScale(
  breaks=c(0, 0.01, 0.02, 0.05, 0.1, 1,4),
  style = 'fixed',
  col = 'YlOrRd')
map.new(kentuckyTract)#,
#  mar = c(0, 0, 1, 0),
#  main = 'Offsets')
# breaks are converted to the log scale of the offset raster; pmax guards
# against log(0) = -Inf, and -6*log(10) rescales to cases per km^2
plot(kentuckyOffset,
  col = oCol$col,
  breaks = pmax(-100, log(oCol$breaks)) - 6*log(10),
  legend = FALSE, add = TRUE)
plot(kMap, add = TRUE, maxpixels=10^7)
legendBreaks('topleft',
  breaks = oCol$breaks, col = oCol$col,
  title = expression(plain('cases/km')^2),
  bg = 'white')
# simulated risk map
iCol = colourScale(
  kCases$raster$relativeIntensity1,
  breaks = 8, style = 'equal',
  dec = -log10(0.5))
map.new(kentuckyTract#,
#  mar = c(0, 0, 1, 0),
#  main = 'Simulated Risk'
)
plot(kCases$raster$relativeIntensity1,
  col = iCol$col, breaks = iCol$breaks,
  legend = FALSE, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft',
  col = iCol$col, breaks = iCol$breaks,
  title = 'RR', bg = 'white')
# simulated events map
map.new(kentuckyTract#,
#  mar = c(0, 0, 1, 0),
#  main = 'Simulated Events'
)
plot(kMap, add = TRUE)
points(kCases$events1,
  col = '#FF000030', pch = 20, cex = 0.5)
scaleBar(kentuckyCounty, pos = 'topleft', bg = 'white')
# simulated counts
cCol = colourScale(
  kentuckyCounty$count1,
  breaks = 10, style = 'quantile',
  dec = -1)
map.new(kentuckyTract#,
#  mar = c(0, 0, 1, 0),
#  main = 'Aggregated Counts'
)
plot(kentuckyCounty, col = cCol$plot, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', cCol)
scaleBar(kentuckyCounty, pos = 'topleft',
  inset = c(0.2, 0), bg = 'white')
# offset by county
cCol = colourScale(
  kentuckyCounty$expected,
  breaks = 10, style = 'quantile',
  dec = -1)
map.new(kentuckyTract#,
#  mar = c(0, 0, 1, 0),
#  main = 'Aggregated Counts'
)
plot(kentuckyCounty, col = cCol$plot, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', cCol)
scaleBar(kentuckyCounty, pos = 'topleft',
  inset = c(0.2, 0), bg = 'white')
#SIR
# standardized incidence ratio: observed counts over expected counts
kentuckyCounty$SIR1 = kentuckyCounty$count1/kentuckyCounty$expected
cCol = colourScale(
  kentuckyCounty$SIR1,
  breaks = 8, style = 'equal',
  dec = -log10(0.5))
# SIR map using the colour scale built at the end of the previous chunk
map.new(kentuckyTract#,
#  mar = c(0, 0, 1, 0),
#  main = 'Aggregated Counts'
)
plot(kentuckyCounty, col = cCol$plot, add = TRUE)
plot(kMap, add = TRUE)
legendBreaks('topleft', cCol)
scaleBar(kentuckyCounty, pos = 'topleft',
  inset = c(0.2, 0), bg = 'white')
The local-EM algorithm requires a smoothing parameter called the bandwidth to estimate the spatial risk. Small values of the bandwidth yield estimates similar to standardized incidence ratios of each areal region, while large values yield estimates close to the overall mean incidence ratio of the entire study area (i.e., [total counts]/[total offsets]). The preferred or optimal bandwidth for the disease data is one that minimizes the trade-off between the bias and variance of the estimator.
To automate the selection of the optimal bandwidth, the lemXv()
function of this package implements a likelihood cross-validation (CV) approach with the set of specified bandwidths. CV scores are computed with $k$-fold sampling without replacement of the dataset. The optimal bandwidth is the one that yields the smallest CV score.
The CV scores with 4-fold sampling are provided for the first simulated dataset.
library('localEM')
# Run bandwidth cross-validation on the first simulated dataset, or
# reload a cached result; the returned object also carries the risk
# estimate at the optimal bandwidth and the smoothing matrices reused
# in later chunks.
fileHere = file.path(path, 'xvKentucky.rds')
if(!file.exists(fileHere)) {
  xvKentucky = lemXv(
    cases = kentuckyCounty[,c('id','count1')],
    population = kentuckyTract,
    cellsCoarse = cellsCoarse,
    cellsFine = cellsFine,
    bw = Sbw,
    xv = nxv,
    ncores = ncores,
    path = path,
    verbose = TRUE)
  saveRDS(xvKentucky, file = fileHere)
} else {
  xvKentucky = readRDS(fileHere)
}
# CV score (-log p) against bandwidth in km; the minimum marks the
# optimal bandwidth.
plot(xvKentucky$xv[,1]/1000, xvKentucky$xv[,2],
  xlab = 'km', ylab = '-log p',
  type = 'o',
  xlim = c(9, 40), ylim = c(0, 20))
The R objects created from the lemXv()
function also contain the local-EM risk
estimation done with the optimal bandwidth found in the CV approach.
High-resolution plots of the local-EM estimation with their optimal bandwidths are
provided for all simulated datasets.
The riskEst()
function allows local-EM risk estimation to be done with
specified bandwidths. Plots of the local-EM estimation with all bandwidths used
in this example are provided for the first simulated dataset. The plots show
high cancer risk in the areas located east of Bowling Green and
south of Frankfort, decreasing as the bandwidth increases.
# Estimate risk for the first dataset at every candidate bandwidth,
# reusing the smoothing matrices from the cross-validation object.
riskKentucky = riskEst(
  cases = kentuckyCounty[,c('id','count1')],
  lemObjects = xvKentucky$smoothingMatrix,
  bw = Sbw,
  ncores = ncores,
  path = cacheDir)
# estimated risk maps
# Reload the risk raster from disk and keep six evenly spaced bandwidth
# layers; recover the bandwidth values (metres) from the layer names.
toPlot = brick(filename(riskKentucky$riskEst))[[
  round(seq(1, nlayers(riskKentucky$riskEst), len=6))
]]
SbwShort = as.numeric(gsub("^bw|_[[:alnum:]]+$", "", names(toPlot)))
# One map per selected bandwidth, on a common colour scale spanning
# zero to the largest estimated risk.
rCol = colourScale(
  c(0, maxValue(toPlot)),
  breaks = 9, style = 'equal',
  dec = -log10(0.5))
for(inR in 1:nlayers(toPlot)) {
  map.new(kentuckyCounty)#,
  #  mar = c(0, 0, 1, 0),
  #  main = paste('Simulation 1: Estimated Risk, bw=', Sbw[inR] / 1000, ' km', sep = ''))
  plot(toPlot[[inR]],
    col = rCol$col, breaks = rCol$breaks,
    legend = FALSE, add = TRUE)
  plot(kMap, add = TRUE, maxpixels=10^7)
  legendBreaks('topleft', rCol, bg = 'white')
}
To measure the uncertainty of the local-EM algorithm, the excProb()
function
computes the exceedance probabilities with the same bandwidth parameter used in
the risk estimation. Bootstrapping from a Poisson process is used to simulate
the events for calculating these exceedance probabilities.
Specifically, under the assumption that disease events are a realisation from the background population at a constant risk threshold, cases are bootstrapped and randomly aggregated to the areal regions. Using the same bandwidth as the observed data, the local-EM risk is then estimated for each bootstrapped dataset. Afterwards, exceedance probabilities are computed as the proportion of bootstrap samples for which the observed risk estimate is at least as large as the bootstrap risk estimate. Large exceedance probabilities are consistent with the risk being greater than the specified threshold.
# Bootstrap exceedance probabilities for the first dataset at the
# optimal bandwidth stored in xvKentucky.
# NOTE(review): fact is hard-coded to 2 here rather than using the
# global `fact` parameter — presumably intentional for this chunk, but
# worth confirming.
excProbKentucky = excProb(
  lemObjects = xvKentucky,
  threshold = threshold,
  Nboot = Nboot,
  fact = 2,
  ncores = ncores,
  path = path,
  verbose=TRUE)
# Map the exceedance probabilities for each threshold, then replot the
# simulated truth and the optimal-bandwidth estimate for comparison.
toPlot = excProbKentucky$excProb
pCol = colourScale(
  toPlot,
  breaks = c(0, 0.2, 0.8, 0.95, 1),
  style = 'fixed',
  col = c('green', 'yellow', 'orange', 'red'))
for(inT in 1:nlayers(toPlot)) {
  map.new(kentuckyCounty)#,
  #  mar = c(0, 0, 1, 0),
  #  main = paste('Simulation 1: Exceedance Probabilities, t=', threshold[inT], sep = ''))
  plot(toPlot[[inT]],
    col = pCol$col, breaks = pCol$breaks,
    legend = FALSE, add = TRUE)
  plot(kMap, add = TRUE, maxpixels=10^7)
}
# NOTE(review): this legend sits outside the loop, so it is drawn only
# on the last threshold's panel — confirm this is the intended layout.
legendBreaks('topleft', pCol, bg = 'white')
# simulated (true) relative intensity, reusing iCol from earlier
map.new(kentuckyTract#,
#  mar = c(0, 0, 1, 0),
#  main = 'Simulated Risk'
)
plot(kCases$raster$relativeIntensity1,
  col = iCol$col, breaks = iCol$breaks,
  legend = FALSE, add = TRUE)
plot(kMap, add = TRUE, maxpixels=10^7)
legendBreaks('topleft',
  col = iCol$col, breaks = iCol$breaks,
  title = 'RR', bg = 'white')
# local-EM estimate at the optimal bandwidth
map.new(kentuckyCounty)
plot( xvKentucky$riskEst,
  col = iCol$col, breaks = iCol$breaks,
  legend = FALSE, add = TRUE)
plot(kMap, add = TRUE, maxpixels=10^7)
legendBreaks('topleft', iCol, bg = 'white')
Using the R objects created from the lemXv()
function for the first simulated dataset, this CV method can be efficiently implemented on the remaining simulated datasets.
# Cross-validate all simulated datasets at once, reusing the smoothing
# matrices already computed for the first dataset.
xvAllKentucky = lemXv(
  cases = kentuckyCounty@data[, grep('^count[[:digit:]]', names(kentuckyCounty))],
  lemObjects = xvKentucky$smoothingMatrix,
  ncores = ncores,
  path = cacheDir,
  verbose=TRUE)
# Overlay the CV curves for every simulated dataset: the first four
# series get distinct Dark2 colours, any remaining series are drawn in
# semi-transparent grey.
# NOTE(review): maxY is computed but unused (ylim is hard-coded below);
# presumably it was intended for ylim — confirm before wiring it in.
maxY = min(max(xvAllKentucky$xv[,-1]), 30)
Scol = RColorBrewer::brewer.pal(4, "Dark2")
# One colour per plotted series: the columns of xv minus the bw column.
# (The original replicated grey nrow(xv) times — the number of
# bandwidths — which only worked when bandwidths >= series and would
# make rep() fail on a negative count otherwise; surplus colours are
# ignored by matplot, so output is unchanged.)
nGrey = max(0, ncol(xvAllKentucky$xv) - 1 - length(Scol))
ScolFull = c(Scol,
  rep(mapmisc::col2html('grey', 0.5), nGrey))
matplot(xvAllKentucky$xv[,'bw']/1000,
  xvAllKentucky$xv[,-1],
  main = 'CV Scores for All Simulations',
  xlab = 'km', ylab = '-log p',
  type = 'l', lty = 1,
  col = ScolFull,
  ylim = c(0, 12), log='x')
legend('topright',
  paste('Simulation ', 1:4, sep = ''),
  lty = 1, col = Scol, cex = 0.6)
# For each simulation, map the optimal-bandwidth risk estimate next to
# the simulated truth on a shared fixed colour scale.
toPlot = brick(filename(xvAllKentucky$riskEst))
rCol = colourScale(
  breaks = c(seq(0,3, by=0.5), 4),
  style = 'fixed')
#bw = as.numeric(gsub('^bw', '', xvAllKentucky$bw))
for(inR in 1:nlayers(toPlot)) {
  # estimated risk
  map.new(kentuckyCounty)#,
  #  mar = c(0, 0, 1, 0),
  #  main = paste('Simulation ', inR, ': Estimated Risk, bw=', bw[inR] / 1000, ' km', sep = ''))
  plot(toPlot[[inR]],
    col = rCol$col, breaks = rCol$breaks,
    legend = FALSE, add = TRUE)
  plot(kMap, add = TRUE)
  legendBreaks('topleft', rCol, bg = 'white')
  # simulated truth for the same dataset
  map.new(kentuckyTract#,
  #  mar = c(0, 0, 1, 0),
  #  main = 'Simulated Risk'
  )
  plot(kCases$raster[[paste('relativeIntensity',inR, sep='')]],
    col = rCol$col, breaks = rCol$breaks,
    legend = FALSE, add = TRUE)
  plot(kMap, add = TRUE)
  legendBreaks('topleft',
    col = rCol$col, breaks = rCol$breaks,
    title = 'RR', bg = 'white')
}
Compute exceedance probabilities for both LocalEM and BYM model:
library(INLA)
# Fit a BYM model to each simulated dataset, rasterize its fitted risk
# and exceedance probabilities, then compute the corresponding local-EM
# exceedance probabilities; all results are written to disk so the
# chunk can be skipped on re-runs.
cat('Computing risk estimation and exceedance probabilities using BYM model...', '\n')
theBymRiskList = theBymExcProbList = as.list(1:nsim)
names(theBymRiskList) = names(theBymExcProbList) = paste("count",1:nsim,sep="")
for (i in 1:nsim){
  cat("Fitting BYM model for simulation:", i,"...\n")
  # intercept-only model with the county log expected counts as offset
  formula = as.formula(paste('count', i, ' ~ offset(logExpected)', sep = ''))
  kBYM = diseasemapping::bym(formula, data = kentuckyCounty,priorCI = list(sd = c(1, 0.05), propSpatial = c(0.2, 0.95)))
  #raster bym estimate
  # rasterize onto the same grid as the local-EM estimates for comparison
  kBYMRaster = raster::rasterize(kBYM$data, y = toPlot[[i]], field = 'fitted.exp')
  names(kBYMRaster) = 'fitted.exp'
  theBymRiskList[[i]] = kBYMRaster
  #compute exceedance probs
  theBymMarg =kBYM$inla$marginals.fitted.bym
  for(inT in 1:length(threshold)) {
    # thresholds are on the risk scale; marginals are on the log scale
    kBYM$data$excProb = geostatsp::excProb(theBymMarg, threshold = log(threshold[inT]))
    theBymExcProb = raster::rasterize( kBYM$data, y = toPlot[[i]], field = 'excProb')
    if(inT == 1) {
      bymExcProbRaster = theBymExcProb
    } else {
      bymExcProbRaster = raster::addLayer(bymExcProbRaster, theBymExcProb)
    }
  }
  names(bymExcProbRaster) = paste('count', i, '_threshold', threshold, sep = '')
  theBymExcProbList[[i]] = bymExcProbRaster
}
##output results to external file
myBymRisk = raster::writeRaster(
  raster::stack(theBymRiskList),
  filename = file.path(path, 'bymRiskR.grd'),
  overwrite = file.exists(file.path(path, 'bymRiskR.grd')))
myBymExcProb = raster::writeRaster(
  raster::stack(theBymExcProbList),
  filename = file.path(path, 'bymExcProbR.grd'),
  overwrite = file.exists(file.path(path, 'bymExcProbR.grd')))
#Compute excProb for localEM
theFileExcProb = file.path(path, paste('lemExcProbR.grd', sep = ''))
if(!file.exists(theFileExcProb)) {
  myLemXv = xvAllKentucky$xv
  myLemRisk = xvAllKentucky$riskEst
  for(inS in 1:nsim) {
    # each simulation uses its own optimal bandwidth; riskEst is
    # temporarily swapped to this simulation's layer for excProb
    bw = xvAllKentucky$bw[inS]
    xvAllKentucky$riskEst = myLemRisk[[inS]]
    cat('Computing localEM exceedance probabilities for simulation:',inS, '\n')
    excProb = localEM::excProb(
      lemObjects = xvAllKentucky,
      threshold = threshold,
      Nboot = Nboot,
      bw = bw,
      fact = fact,
      ncores = min(ncores, 2),
      iterations = list(tol = 1e-7, maxIter = 3000, gpu = FALSE),
      path = path,
      filename = paste('lemExcProbR', inS, '.grd', sep = ''),
      verbose = TRUE)
    theLemExcProb = excProb$excProb
    # multiplying by 1 forces the layers into memory before stacking
    if(inS == 1) {
      theLemExcProbStack = theLemExcProb * 1
    } else {
      theLemExcProbStack = raster::addLayer(theLemExcProbStack, theLemExcProb * 1)
    }
  }
  myLemExcProb = raster::writeRaster(
    theLemExcProbStack,
    filename = theFileExcProb,
    overwrite = file.exists(theFileExcProb))
} else {
  myLemExcProb = stack(theFileExcProb)
}
Compute ROC curves for both LocalEM and BYM model:
# Compute ROC curves (sensitivity vs 1-specificity against the known
# simulated truth) for both methods, then plot the curves averaged over
# simulations.
theSpec = seq(0, 1, by = 1/Nboot)
myLemRoc = list()
myBymRoc = list()
mySimSurface = kCases$raster[[grep("^relativeIntensity[[:digit:]]", names(kCases$raster))]]
for(inS in 1:nsim) {
  ##simulation surface
  theSimSurface = mySimSurface[[inS]]
  names(theSimSurface) = 'relativeIntensity'
  ##ROC from local-EM analysis
  cat("Computing ROC curves for simulation:",inS,"...\n")
  cat("localEM...\n")
  # select this simulation's layers and strip prefixes so names are
  # 'threshold.<t>' as spatialRoc expects
  lemExcProb = myLemExcProb[[grep(paste('(count|case)', inS, '_', sep = ''), names(myLemExcProb))]]
  names(lemExcProb) = gsub('^bw[[:digit:]]+\\_(count|case)[[:digit:]]+\\_', '', names(lemExcProb))
  names(lemExcProb) = gsub('^threshold', 'threshold.', names(lemExcProb))
  lemRoc = try(
    geostatsp::spatialRoc(
      fit = lemExcProb,
      rr = threshold,
      truth = theSimSurface,
      random = FALSE,
      spec = theSpec),
    silent = TRUE)
  # NOTE(review): inherits(lemRoc, 'try-error') would be the idiomatic
  # check; behaviour is the same for try()'s single-class result
  if(class(lemRoc) != 'try-error') {
    myLemRoc[[inS]] = lemRoc[,]
  }
  cat("BYM...\n")
  ##ROC from BYM analysis
  bymExcProb = myBymExcProb[[grep(paste('(count|case)', inS, '_', sep = ''), names(myBymExcProb))]]
  names(bymExcProb) = gsub('^(count|case)[[:digit:]]+\\_', '', names(bymExcProb))
  names(bymExcProb) = gsub('^threshold', 'threshold.', names(bymExcProb))
  bymRoc = try(
    geostatsp::spatialRoc(
      fit = bymExcProb,
      rr = threshold,
      truth = theSimSurface,
      random = FALSE,
      spec = theSpec),
    silent = TRUE)
  if(class(bymRoc) != 'try-error') {
    myBymRoc[[inS]] = bymRoc[,]
  }
}
save(myLemRoc, myBymRoc, file = paste(path,"rocEstRKentucky.RData", sep = ""))
cat(date(), '\n')
cat('done', '\n')
require(ggplot2)
###Plot
# Reshape each simulation's ROC matrices to long format (one row per
# specificity/model/threshold) and stack them.
roc2Plot = data.frame()
for (i in 1:length(myLemRoc)){
  rocP = data.frame(rbind(myLemRoc[[i]], myBymRoc[[i]]))
  rocP$model = c(rep("LEM",length(theSpec)),rep("BYM",length(theSpec)))
  rocPlong = reshape(rocP,varying = list(grep("^X",names(rocP))),idvar = c("onemspec", "model"), direction = "long", v.names = "Sensitivity",timevar = "threshold",times =as.character(gsub("^X","",names(rocP)[grep("^X",names(rocP))])))
  rocPlong$sim = i
  roc2Plot = rbind(roc2Plot,rocPlong)
}
# average sensitivity over simulations at each specificity point
roc2Plot = aggregate(Sensitivity ~ onemspec + model + threshold, data = roc2Plot,mean,na.rm = T)
p = ggplot(data = roc2Plot,aes(x = onemspec, y = Sensitivity, color = model, linetype = threshold, group = interaction(model, threshold))) +
  geom_line() +
  xlab("1-Specificity")
print(p)
Nguyen P, Brown PE, Stafford J. Mapping cancer risk in southwestern Ontario with changing census boundaries. Biometrics. 2012; 68(4): 1229-37.
Besag J, York J, Mollie A. Bayesian image restoration, with two applications in spatial statistics. Ann Inst Statist Math. 1991; 43(1): 1-59.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.