#devtools::load_all("../pkg/localem") #knitr::knit("../pkg/localem/inst/doc/toronto.Rmd", 'toronto.md')
baseDir = file.path('/store',Sys.info()['user'],'localem/toronto') mapDir = file.path(baseDir, 'mapCache', '') path = file.path(baseDir, 'highResLocalem', '') cacheDir = file.path(baseDir, 'highResCache', '') figDir = file.path('figureToronto', '') dir.create(mapDir, showWarnings = FALSE, recursive = TRUE) dir.create(path, showWarnings = FALSE, recursive = TRUE) dir.create(cacheDir, showWarnings = FALSE, recursive = TRUE) dir.create(figDir, showWarnings = FALSE, recursive = TRUE) knitr::opts_chunk$set( dev = 'png', fig.width = 4, fig.height = 2.5, dpi = 100, out.width = '45%', fig.ncol = 2, fig.path=figDir, cache.path=cacheDir, margins=1 )
if(requireNamespace('Pmisc', quietly = TRUE)) { file.copy( system.file('src','webpage.css', package='Pmisc'), 'webpage.css') cat(Pmisc::markdownHeader( title = 'High Resolution Local-EM Example for Toronto', author = 'Lennon Li, Patrick Brown', css = 'webpage.css', geometry = 'margin=1in')) knitr::knit_hooks$set(plot = Pmisc::hook_plot_mdsubfig) knitr::knit_hooks$set(margins = Pmisc::hook_plot_margins) } else { knitr::knit_hooks$set(plot = knitr::hook_plot_html) knitr::opts_chunk$set(out.width = '80%') cat('\n# Local-EM Example for Toronto\n') }
library('rgdal') library('raster') library('localEM') library('mapmisc') options(mapmiscCachePath = mapDir)
load('gtaFsaCt2007.RData') gtaOrigFsa = mapmisc::omerc(gtaOrigFsa, angle = -17) gtaSelectFsa = spTransform(gtaSelectFsa, gtaOrigFsa) gtaSelectFsa = gtaSelectFsa[substr(gtaSelectFsa$id, 1, 1) == 'M',] gtaOrigCt = spTransform(gtaOrigCt, gtaOrigFsa) gtaOrigCt$expected = 8 * gtaOrigCt$expected
kMap = mapmisc::tonerToTrans( mapmisc::openmap(gtaSelectFsa, fact=2, path='stamen-toner'))
The localEM
package contains functions to implement the kernel smoothing local-EM algorithm$^1$ of disease data aggregated to geographical regions. This algorithm provides an nonparametric alternative to the standard geospatial models, such as the Besag-York-Mollie (BYM) model$^2$, for estimating spatial risk of areal disease data. With disease cases typically aggregated to highly coarse geographical regions (e.g., census counties, or census subdivisions), the local-EM method creates a tessellation of distinct regions by overlaying the map of these coarse regions with another map containing fine geographical regions (e.g., census tracts, census blocks, or census dissemination areas) of population data. This allows for the spatial risk to be estimated at a better resolution with the fine regions.
The local-EM kernel smoothing algorithm of the localEM
package was tested on lupus diagnoses in the City of Toronto of the GTA (i.e., forward sortation areas (FSAs) starting with 'M'). Lupus cases were simulated using the population data (i.e., the expected lupus cases) at the census tract level (see Figure 1) with a total expectation of 1849 cases for the study area
ncores = min(c(16, parallel::detectCores())) cellsFine = 800 cellsCoarse = 40 nsim = 12 nxv = 6 Sbw = c(0.5, 0.75, 1, 1.25, 2, 2.5, 3,4,6) *1000 focalSize = min(c(15*1000, max(Sbw)*1.5)) Sthreshold = c(1.1, 1.25, 1.5) Nboot = 100 range = 10000 set.seed(100)
specify number of grid cells for simulation
if(requireNamespace('RandomFields', quietly = TRUE)) { cellsSimulate = 800 } else { cellsSimulate = 100 }
Using the simLgcp()
function from the geostatsp
package, case locations are simulated with the log Gaussian Cox process (LGCP) and following parameters:
theLogExpectedRaster = geostatsp::spdfToBrick( gtaOrigCt, geostatsp::squareRaster(gtaOrigCt, cellsSimulate), pattern = '^expected$', logSumExpected = TRUE) theLogExpectedRaster = raster::trim(raster::mask(theLogExpectedRaster, gtaSelectFsa)) set.seed(0) theSimData = geostatsp::simLgcp( param = c(mean = 0, variance = 0.4^2, range = range, shape = 2), covariates = list(logExpected = theLogExpectedRaster) , offset = 'logExpected', n = nsim)
The simulated cases are then aggregated to the appropriate FSAs Plots of the relative intensity, event locations and aggregated data are provided for the first simulated dataset.
##relative intensity surfaces theRasterNames = names(theSimData$raster) mySimSurfaceGTA = theSimData$raster[[grep('^relativeIntensity', theRasterNames)]] ##case locations mySimSpGTA = theSimData[grep('^events[[:digit:]]+?', names(theSimData))] theSimCases = lapply( theSimData[grep('^events[[:digit:]]+?', names(theSimData))], function(qq) over(qq, gtaSelectFsa)[,'id']) mySimCasesGTA = as.data.frame(lapply( theSimCases, function(xx) as.vector(table(xx, exclude = NULL)[as.character(gtaSelectFsa$id)]))) mySimCasesGTA[is.na(mySimCasesGTA)] = 0 colnames(mySimCasesGTA) = gsub('^events', 'count', colnames(mySimCasesGTA)) rownames(mySimCasesGTA) = as.character(gtaSelectFsa$id) gtaSelectFsa= merge(gtaSelectFsa, mySimCasesGTA, by.x = 'id', by.y = 'row.names') ##log-offsets myLogExpectedGTA = theSimData$raster[['logExpected']] save(myLogExpectedGTA, mySimSurfaceGTA, mySimSpGTA, mySimCasesGTA, file = file.path(path, paste('simCasesR', range, 'mGTA.RData', sep = '')) )
# theOffsets oCol = colourScale(myLogExpectedGTA, breaks = 6, style = 'equal', dec = 1, revCol = FALSE) map.new(gtaSelectFsa, mar = c(0, 0, 0, 0), xaxt = 'n', yaxt = 'n') plot(myLogExpectedGTA, col = oCol$col, breaks = oCol$breaks, legend = FALSE, add = TRUE) plot(kMap, add = TRUE) legendBreaks('bottomright', breaks = oCol$breaks, col = oCol$col, cex = 0.7, bg = 'white') # simulated risk map iCol = colourScale( theSimData$raster$relativeIntensity1, breaks = 8, style = 'equal', dec = -log10(0.5)) map.new(gtaSelectFsa#, # mar = c(0, 0, 1, 0), # main = 'Simulated Risk' ) plot(theSimData$raster$relativeIntensity1, col = iCol$col, breaks = iCol$breaks, legend = FALSE, add = TRUE) plot(kMap, add = TRUE) legendBreaks('bottomright', col = iCol$col, breaks = iCol$breaks, title = 'RR', bg = 'white') # simulated events map map.new(gtaSelectFsa#, # mar = c(0, 0, 1, 0), # main = 'Simulated Events' ) plot(kMap, add = TRUE) points(theSimData$events1, col = '#FF000030', pch = 20, cex = 0.5) scaleBar(gtaSelectFsa, pos = 'bottomright', bg = 'white', cex=0.6) # simulated counts cCol = colourScale( gtaSelectFsa$count1, breaks = 10, style = 'quantile', dec = -1) map.new(gtaSelectFsa#, # mar = c(0, 0, 1, 0), # main = 'Aggregated Counts' ) plot(gtaSelectFsa, col = cCol$plot, add = TRUE) plot(kMap, add = TRUE) legendBreaks('bottomright', cCol, cex=0.7)
The local-EM algorithm requires a smoothing parameter called the bandwidth to estimate the spatial risk. Small values of the bandwidth yield estimates similar to standardized incidence ratios of each areal regions, while large values yield estimates to the overall mean incidence ratio of the entire study area (i.e., [total counts]/[total offsets]). The preferred or optimal bandwidth for the disease data is one that minimizes the trade-off between the bias and variance of the estimator.
To automatic the selection of the optimal bandwidth, the lemXv()
function of this package implements a likelihood cross-validation (CV) approach with the set of specified bandwidths. CV scores are computed with $k$-fold sampling without replacement of the dataset. The optimal bandwidth is the one that yields the smallest CV score.
The CV scores with 4-fold sampling are provided for the first simulated dataset.
rasterFile = file.path(path, 'rasterPartition.rds') if(!file.exists(rasterFile)) { lemRasterGTA = rasterPartition( polyCoarse = gtaSelectFsa, polyFine = gtaOrigCt, cellsCoarse = cellsCoarse, cellsFine = cellsFine, bw = Sbw, ncores = ceiling(ncores/4), focalSize = focalSize, xv = nxv, path = path, idFile = 'lemId.grd', offsetFile = 'lemOffsets.grd', verbose = TRUE) saveRDS(lemRasterGTA, rasterFile) } else { lemRasterGTA = readRDS(rasterFile) }
smoothingMatrixFile = file.path(path, 'smoothingMatrix.rds') if(!file.exists(smoothingMatrixFile)) { lemSmoothMatGTA = smoothingMatrix( rasterObjects = lemRasterGTA, ncores = ncores, path = path, filename = 'lemSmoothMat.grd', verbose = TRUE) saveRDS(lemSmoothMatGTA, smoothingMatrixFile) } else { lemSmoothMatGTA = readRDS(smoothingMatrixFile) }
lemXvFile = file.path(path, 'lemXv.rds') if(!file.exists(lemXvFile)) { lemXvGTA = lemXv( cases = gtaSelectFsa@data[,grep('^count[[:digit:]]', names(gtaSelectFsa))], lemObjects = lemSmoothMatGTA, ncores = ceiling(ncores/4), iterations = list(tol = 1e-7, maxIter = 3000, gpu = FALSE), randomSeed = range, path = path, filename = paste('lemRiskR', range , 'm.grd', sep = ''), verbose = TRUE) saveRDS(lemXvGTA, lemXvFile) } else { lemXvGTA = readRDS(lemXvFile) } myLemXvGTA = lemXvGTA$xv myLemRiskGTA = lemXvGTA$riskEst
maxY = min(max(lemXvGTA$xv[,-1]), 30) Scol = RColorBrewer::brewer.pal(4, "Dark2") ScolFull = c(Scol, rep(mapmisc::col2html('grey', 0.5), nrow(lemXvGTA$xv) - length(Scol) ) ) matplot(lemXvGTA$xv[,'bw']/1000, lemXvGTA$xv[,-1], # main = 'Cross validation for all Simulations', xlab = 'km', ylab = '-log p', type = 'l', lty = 1, col = ScolFull, ylim = c(0, 10), log='x')
toPlot = brick(filename(lemXvGTA$riskEst)) rCol = colourScale( breaks = c(seq(0,3, by=0.5), 4), style = 'fixed') #bw = as.numeric(gsub('^bw', '', xvAllKentucky$bw)) for(inR in 1:min(c(5,nlayers(toPlot)))) { map.new(gtaSelectFsa)#, # mar = c(0, 0, 1, 0), # main = paste('Simulation ', inR, ': Estimated Risk, bw=', bw[inR] / 1000, ' km', sep = '')) plot(toPlot[[inR]], col = rCol$col, breaks = rCol$breaks, legend = FALSE, add = TRUE) plot(kMap, add = TRUE) map.new(gtaSelectFsa#, # mar = c(0, 0, 1, 0), # main = 'Simulated Risk' ) plot(theSimData$raster[[paste('relativeIntensity',inR, sep='')]], col = rCol$col, breaks = rCol$breaks, legend = FALSE, add = TRUE) plot(kMap, add = TRUE) } legendBreaks('topleft', col = rCol$col, breaks = rCol$breaks, bg = 'white')
To measure the uncertainty of the local-EM algorithm, the excProb()
function
computes the exceedance probabilities with the same bandwidth parameter used in
the risk estimation. Bootstrapping from a Poisson process is used to simulate
the events for calculating these exceedance probabilities.
Specifically, under the assumption that disease events are a realisation from the background population and constant risk threshold, cases are bootstrapped and randomly aggregated to the areal regions. Using the same bandwidth as the observed data, the local-EM risk is then estimated for each of the bootstrapped data. Afterwards, exceedance probabilities are computed as the proportion of the observed risk estimate at least as large as the ones of the bootstrap data. Large exceedance probabilities are consistent with the risk being greater than the specified threshold.
#Compute excProb for localEM theFileExcProb = file.path(path, paste('lemExcProbR.grd', sep = '')) if(!file.exists(theFileExcProb)) { Sbw = unique(lemXvGTA$bw) theLemExcProbStack = raster(lemXvGTA$riskEst) for(Dbw in Sbw) { cat('Computing localEM exceedance probabilities for simulation:',Dbw, '\n') excProb = localEM::excProb( lemObjects = lemXvGTA, threshold = Sthreshold, Nboot = Nboot, bw = Dbw, fact = 4, ncores = 1, iterations = list(tol = 1e-7, maxIter = 3000, gpu = FALSE), path = path, filename = paste('lemExcProbR', Dbw, '.grd', sep = ''), verbose = TRUE) theLemExcProbStack = raster::addLayer( theLemExcProbStack, excProb$excProb) } myLemExcProb = raster::writeRaster( theLemExcProbStack, filename = theFileExcProb, overwrite = file.exists(theFileExcProb)) } else { myLemExcProb = brick(theFileExcProb) }
Compute exceedance probabilities using BYM model:
set up BYM
idCoarse = 1:length(lemSmoothMatGTA$polyCoarse) offsetRaster = raster::stack(lemSmoothMatGTA$offset$offset, raster::deratify(lemSmoothMatGTA$rasterFine)) offsetDf = stats::aggregate(x = values(offsetRaster$offset), by = list(idCoarse = values(offsetRaster$idCoarse)), FUN = sum) colnames(offsetDf) = c('idCoarse','offset') offsetDf = merge(data.frame(idCoarse = idCoarse), offsetDf, by = 'idCoarse', all = TRUE) offsetDf$offset[is.na(offsetDf$offset)] = 0 offsetDf$expected = offsetDf$offset * prod(res(offsetRaster$offset)) offsetDf$logExpected = log(offsetDf$expected) offsetDf$logExpected[is.infinite(offsetDf$logExpected)] = NA offsetDf$id = lemSmoothMatGTA$polyCoarse$id gtaSelectFsaForBYM = merge( gtaSelectFsa[,setdiff(names(gtaSelectFsa), 'logExpected')], offsetDf, by = 'id') theBymRiskList = theBymExcProbList = as.list(1:nsim) names(theBymRiskList) = names(theBymExcProbList) = paste("count",1:nsim,sep="")
Computing risk estimation and exceedance probabilities using BYM model
library('INLA') for (i in 1:nsim){ # cat("Fitting BYM model for simulation:", i,"...\n") formula = as.formula(paste('count', i, ' ~ offset(logExpected)', sep = '')) kBYM = diseasemapping::bym(formula, data = gtaSelectFsaForBYM, priorCI = list(sd = c(1, 0.05), propSpatial = c(0.2, 0.95))) #raster bym estimate kBYMRaster = raster::rasterize(kBYM$data, y = toPlot[[i]], field = 'fitted.exp') names(kBYMRaster) = 'fitted.exp' theBymRiskList[[i]] = kBYMRaster #compute exceedance probs theBymMarg =kBYM$inla$marginals.fitted.bym for(inT in 1:length(Sthreshold)) { kBYM$data$excProb = geostatsp::excProb(theBymMarg, threshold = log(Sthreshold[inT])) theBymExcProb = raster::rasterize(kBYM$data, y = toPlot[[i]], field = 'excProb') if(inT == 1) { bymExcProbRaster = theBymExcProb } else { bymExcProbRaster = raster::addLayer(bymExcProbRaster, theBymExcProb) } } names(bymExcProbRaster) = paste('count', i, '_threshold', Sthreshold, sep = '') theBymExcProbList[[i]] = bymExcProbRaster } ##output results to external file myBymRisk = raster::writeRaster( raster::stack(theBymRiskList), filename = file.path(path, 'bymRiskR.grd'), overwrite = file.exists(file.path(path, 'bymRiskR.grd'))) myBymExcProb = raster::writeRaster( raster::stack(theBymExcProbList), filename = file.path(path, 'bymExcProbR.grd'), overwrite = file.exists(file.path(path, 'bymExcProbR.grd')))
Compute ROC curves for both LocalEM and BYM model:
rocFile = paste(path,"roc.RData", sep = "") if(!file.exists(rocFile)) { theSpec = seq(0, 1, by = 1/Nboot) myLemRoc = list() myBymRoc = list() mySimSurface = theSimData$raster[[grep("^relativeIntensity[[:digit:]]", names(theSimData$raster))]] for(inS in 1:nsim) { ##simulation surface theSimSurface = mySimSurface[[inS]] names(theSimSurface) = 'relativeIntensity' ##ROC from local-EM analysis cat("Computing ROC curves for simulation:",inS,"...\n") cat("localEM...\n") lemExcProb = myLemExcProb[[ grep(paste0('(count|case)', inS, '_'), names(myLemExcProb)) ]] names(lemExcProb) = gsub('^bw[[:digit:]]+\\_(count|case)[[:digit:]]+\\_', '', names(lemExcProb)) names(lemExcProb) = gsub('^threshold', 'threshold.', names(lemExcProb)) lemRoc = try( geostatsp::spatialRoc( fit = lemExcProb, rr = Sthreshold, truth = theSimSurface, random = FALSE, spec = theSpec), silent = TRUE) if(class(lemRoc) != 'try-error') { myLemRoc[[inS]] = lemRoc[,] } cat("BYM...\n") ##ROC from BYM analysis bymExcProb = myBymExcProb[[grep(paste('(count|case)', inS, '_', sep = ''), names(myBymExcProb))]] names(bymExcProb) = gsub('^(count|case)[[:digit:]]+\\_', '', names(bymExcProb)) names(bymExcProb) = gsub('^threshold', 'threshold.', names(bymExcProb)) bymRoc = try( geostatsp::spatialRoc( fit = bymExcProb, rr = Sthreshold, truth = theSimSurface, random = FALSE, spec = theSpec), silent = TRUE) if(class(bymRoc) != 'try-error') { myBymRoc[[inS]] = bymRoc[,] } } #loop through Nsim cat(date(), '\n') cat('done', '\n') roc2Plot = data.frame() for (i in 1:length(myLemRoc)){ rocP = data.frame(rbind(myLemRoc[[i]], myBymRoc[[i]])) rocP$model = c(rep("LEM",length(theSpec)),rep("BYM",length(theSpec))) rocPlong = reshape(rocP,varying = list(grep("^X",names(rocP))),idvar = c("onemspec", "model"), direction = "long", v.names = "Sensitivity",timevar = "threshold",times =as.character(gsub("^X","",names(rocP)[grep("^X",names(rocP))]))) rocPlong$sim = i roc2Plot = rbind(roc2Plot,rocPlong) } roc2Plot = aggregate(Sensitivity ~ onemspec + model + threshold, data = roc2Plot,mean,na.rm = T) save(roc2Plot, file = rocFile) } else { load(rocFile) } roc2reshape = reshape2::dcast( roc2Plot, onemspec + threshold ~ model, value.var='Sensitivity') Sthreshold = unique(roc2reshape$threshold) Scol = c("BYM"='blue',"LEM"='red')
for(D in Sthreshold) { Shere = which(roc2reshape$threshold == D) matplot( roc2reshape[Shere,'onemspec'], roc2reshape[Shere, names(Scol)], type='l', xlim = c(0, 0.4), ylim = c(0.6, 1), xlab = '1-spec', ylab='sens', lty = 1, col = Scol ) legend('bottomright', col=Scol, lty=1, lwd=4, legend = names(Scol), bty='n') }
Nguyen P, Brown PE, Stafford J. Mapping cancer risk in southwestern Ontario with changing census boundaries. Biometrics. 2012; 68(4): 1229-37.
Besag J, York J, Mollie A. Bayesian image restoration, with two applications in spatial statistics. Ann Inst Statist Math. 1991; 43(1): 1-59.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.