knitr::opts_chunk$set(collapse = TRUE, comment = "R>", cache=FALSE, fig.width=7.2) rm(list=ls()) library(TexMix) library(spatstat) # Key library for spatial point pattern analysis library(smacpod) # Relative risk kernel densities based on statstat library(colorspace) # Used for thematic mapping of kernel densities library(car) # Regression diagnostics library(effects) # Effect plots to explore non-linearities library(sp) library(maptools) library(rgdal)
The key proposition of this analysis is that the spatial variability of food supply in an urban environment can be approximated by kernel densities around individual stores selling food. It is assumed that grocery stores offer nutritious food whereas convenience stores (including food aisles of gas stations etc.) predominately sell processed food comprising mainly of 'empty' calories.
The potential spatial extend of the store's market areas is proportional to the bandwidth of the kernel function. The estimated sales volume of food products is used as weight of the kernel density estimator. Usually grocery stores have a larger market area and a higher food sales volume than convenience stores.
General access to food is modeled by the combination of both store types, whereas access to quality food is modeled by grocery stores and access to inferior food is modeled by convenience stores.
The concept of food deserts can either be modeled by the general access to food or more specifically by the access to quality food. This study aims to evaluate both concepts. Log relative risk ratios of convenience stores against grocery stores identify spatial imbalances between both food sources.
One needs to bear in mind that the location of stores also has to follow the potential demand. Locations of insufficient demand cannot support food stores.
Several factors are ignored in this study:
POPDEN
.Kernel weights effectively increase or decrease the number of points or a fraction thereof at a location relative to the weights at other locations.
At any given distance from the point location just the intensity is modified by weights but not the spatial reach of the density estimator. Thus, weights have no impact the bandwidth.
The graph below compares the density at a location with a weight of 1 against that at a location with a weight of 3.
################################################### ### Impact of weights on Gaussian Kernel Density ################################################### x <- c(-5,4.9,5,5.1) nx <-length(x) bw <-1 fx <- density(x , bw=bw, kernel="gaussian") plot(fx, main="Impact on Weights on Gaussian Kernel Densities\nweight=1 versus weight=3 (equivalent to 3 points)") abline(h=0) points(x, rep(0, nx), col="red", cex=1.5, pch=20) abline(v=c(-5-bw,-5+bw, 5-bw,5+bw), lty=2)
The following figure calculates the densities for two store types and the evaluates the log relative risk ratio of both densities. Since the convenience stores are in the numerator of the log risk ratio, the log relative risk ratio is positive if the density of the convenience stores is higher than that of the grocery stores.
Note that this example does not make use of differential weights and varying bandwidths.
###################################################### ### Grovery versus Convenience Stores Kernel Densities ###################################################### bw <- 1 groc <- c(-1,-1.5,-2) conv <- c(1,1.5,2) both <- c(groc,conv) f.both <- density(both , bw=bw, kernel="gaussian") f.both$y <- f.both$y+0.006 f.groc <- density(groc , bw=bw, kernel="gaussian", from=min(f.both$x), to=max(f.both$x)) f.groc$y <- f.groc$y/2 f.conv <- density(conv , bw=bw, kernel="gaussian", from=min(f.both$x), to=max(f.both$x)) f.conv$y <- f.conv$y/2 f.risk <- log(f.conv$y/f.groc$y)/100 plot(f.both, ylim=c(-0.2,0.2), type="n", xlab="Store Locations & Cumulative Market Areas (bw=1)", ylab="Density & log(RR)", main="Market Areas by Store Types &\nlog(Relative Risks)") lines(f.conv, lwd=4, col="red") lines(f.groc, lwd=4, col="green") lines(f.both, lwd=4, col="blue") lines(f.both$x, f.risk, col="purple", lwd=4, lty=2) abline(h=0, lwd=4) points(conv, rep(0, length(conv)), col="red", cex=4, pch=20) points(groc, rep(0, length(groc)), col="green", cex=4, pch=20) legend("bottomright", legend = c("Grocery Stores", "Convenience Stores", "Joint Both Stores", "log(Relative Risk)"), col=c("green","red","blue","purple"), lwd=rep(4,4), bty="o", title="Store Types & log(RR)")
densityKernelFct
Since kernel densities are calculate on a fixed raster grid, the pixel values need to be aggregated into pixelated census tracts. Both the grid dimensions and the pixel dimensions of the census tracts need to match. This function is a slow performer due to the inefficient use of a for loop.
densityKernelFct <- function(kernel, tess){ ## ## Split a kernel density image into individual tessellations (also type image) ## and calculated for each tessellation descriptive statistics of the kernel ## density pixel values ## tract <- split(kernel, tess) # see Baddeley p 122 #sapply(tract, integral.im) # kernel sum in tract n <- length(tract) # number of tracts ## Initialized data-frame of results df <- data.frame(SeqId=1:n, nOfCells=NA, Sum=NA, Density=NA, varDens=NA) ## Cycle over tracts. Data values are in $v and may include NAs. ## $v is a matrix. for (i in 1:n){ Sum <- sum(tract[[i]]$v, na.rm = T) Cells <- length(na.omit(as.vector(tract[[i]]$v))) Density <- mean(as.vector(tract[[i]]$v), na.rm = T) varDens <- var(as.vector(tract[[i]]$v), na.rm = T) df[i,2:5] <- c(Cells, Sum, Density, varDens) } return(df) } ## end::densityKernelFct
The original maps are in the long-lat format. In order to perform isotropic distance calculations the maps need to be re-projected into the UTM coordinates system. Note that Dallas is in the UTM zone 14.
Lastly the point coordinates of the stores need to be converted into ppp-objects
of the spatstat
package.
## Check tracts unique sequence Id plot(tractShp, axes=T, main="Check SeqId") text(coordinates(tractShp), as.character(tractShp$SeqId), cex=0.5) ## Change the map coordinate system to UTM zone 14 (best fit for Texas) proj4string(bndShp) # Current projection system projUTM <- CRS("+proj=utm +zone=14 +units=m") # New coordinate sytem definition bndUTM <- spTransform(bndShp, projUTM) # Re-project boundary ptsUTM <- spTransform(foodStoresShp, projUTM) # Re-project store locations tractUTM <- spTransform(tractShp, projUTM) # Re-project tracts ## Converting UTM shape-files to ppp object and assign marks ptsDf <- as.data.frame(ptsUTM) pts <- as.ppp(ptsUTM) pts$marks <- NULL pts$marks <- ptsDf$STORETYPE #pts$marks <- factor(ptsDf$STORETYPE, labels=c("healhy","junk"))
spatstat
This code chunk sets the resolution of the pixels and converts the census tracts into a tessellation object. These tract tessellations are subsequently transformed into a raster image of matching dimensions.
## Define window with units and pixel resolution win <- as.mask(as.owin(bndUTM), eps=200) # Width and height of pixels = 200 m unitname(win) <- list("meter","meters") pts <- pts[win] # assign owin to pts summary(pts) ## Convert tracts to tesselation object tractDf <- as.data.frame(tractUTM) # Save the attribute informaton tracts <- slot(tractUTM, "polygons") tracts <- lapply(tracts, function(x) { SpatialPolygons(list(x)) }) tracts <- lapply(tracts, as.owin) tractsTess <- tess(tiles=tracts) ## Converting tesselation to grid image with 200x200 meters cell size tractsTessIm <- tess(tiles=tracts, window=win)
The food supply densities in Dallas County are evaluated for grocery stores, convenience stores and the overall food supply by merging both store types. Each store is weighted by it food related sales volume.
It is assumed that each grocery store's market area is larger than that of convenience stores. Thus the band width of grocery stores is 4000 meters and that of convenience stores only 2000 meters. The compromise band width of 3000 meters is selected when evaluating both store types together. Other bandwidths for the store's market areas can be considered.
The Gaussian kernel function implies that approximately 68 % of the customers travel less than one band width unit to the closest store.
############################################################################## ## Sample code evaluating the food store market areas by food sales weighted ## kernel density estimates ############################################################################## ## subset cases and controls groc <- subset(pts, marks=="Healthy Food") conv <- subset(pts, marks=="Junk Food") grocFoodSales <- ptsDf[ptsDf$STORETYPE=="Healthy Food","FOODSALES"] convFoodSales <- ptsDf[ptsDf$STORETYPE=="Junk Food","FOODSALES"]
## Evaluate grocery food stores by census tract (b=4000) goodFoodIm <- density(groc, weights=grocFoodSales, sigma=4000) summary(goodFoodIm) plot(goodFoodIm, main="Weighted Nutritious Food Store Kernel Density\nbw = 4000 m") plot(tractsTess, add=T) plot(groc, cex=0.5, pch=16, col="green", add=T) box(); axis(1); axis(2) grocFoodStats <- densityKernelFct(goodFoodIm, tractsTessIm) summary(grocFoodStats) tractShp$good4000D <- grocFoodStats$Density tractShp$good4000V <- grocFoodStats$varDens mapColorRamp(tractShp$good4000D, tractShp, breaks=8, legend.cex = 0.6, map.title="Integrated Nutritious Food Store WKD\nbw=4000", legend.title= "Grocery Food\nStores", add.to.map=F) plot(lakesShp, col="skyblue", border="skyblue",add=T) plot(hwyShp, col="cornsilk3", lwd=3, add=T) plot(foodDesertShp, border="magenta",lwd=2, add=T)
## Evaluate convenience food stores by census tract (bw=2000) badFoodIm <- density(conv, weights=convFoodSales, sigma=2000) plot(badFoodIm, main="Weighted Processed Food Store Kernel Density\nbw = 2000") plot(tractsTess, add=T) plot(conv, cex=0.5, pch=16, col="bisque", add=T) box(); axis(1); axis(2) badFoodStats <- densityKernelFct(badFoodIm, tractsTessIm) summary(badFoodStats) tractShp$bad2000D <- badFoodStats$Density tractShp$bad2000V <- badFoodStats$varDens mapColorRamp(tractShp$bad2000D, tractShp, breaks=8, legend.cex = 0.6, map.title="Integrated Processed Food Store WKD\nbw=2000", legend.title= "Convenience Food\nStores", add.to.map=F) plot(lakesShp, col="skyblue", border="skyblue",add=T) plot(hwyShp, col="cornsilk3", lwd=3, add=T) plot(foodDesertShp, border="magenta",lwd=2, add=T)
## Evaluate both store types together (bw=3000) bothStoresIm <- density(unmark(pts), weights=ptsDf$FOODSALES, sigma=3000) plot(bothStoresIm, main="Weighted All Stores Kernel Density\nbw = 3000 m") plot(tractsTess, col="grey", add=T) plot(pts, cex=0.5, pch=16, col="green", add=T) box(); axis(1); axis(2) allFoodStats <- densityKernelFct(bothStoresIm, tractsTessIm) summary(allFoodStats) tractShp$nCells <- allFoodStats$nOfCells tractShp$all3000D <- allFoodStats$Density tractShp$all3000V <- allFoodStats$varDens mapColorRamp(tractShp$all3000D, tractShp, breaks=8, legend.cex = 0.6, map.title="Integrated All Food Stores WKD\nbw=3000", legend.title= "All Food\nStores", add.to.map=F) plot(lakesShp, col="skyblue", border="skyblue",add=T) plot(hwyShp, col="cornsilk3", lwd=3, add=T) plot(foodDesertShp, border="magenta",lwd=2, add=T)
Here the relative risk ratios per pixel and census tract are calculated and the significance of over- or under-supply of convenience stores relative to grocery stores are evaluated by a randomization algorithm.
Note that in the sparsely populated South-East sector of Dallas just a few convenience stores are located along I45 and no grocery store. This leads to an unrealistic large log-relative risk ratio. In contrast the Park Cities in the center of Dallas County lack convenience stores thus making the log-relative risk ratio significantly negative.
## ## Relative Risk evaluation with smacpod::logrr ## riskMap <- logrr(pts, case=2, sigma=2000, sigmacon=4000, weights=ptsDf$FOODSALES) bound <- max(abs(range(riskMap$v, na.rm=TRUE))) plot(riskMap, main="Relative Weighted Risk Surface\nlog(Convenience/Grocery)", breaks=seq(-bound, bound, length.out=16+1), col=diverge_hsv(16)) plot(tractsTess, col="grey", add=T) box(); axis(1); axis(2) ## Evaluate risk by census tract riskStats <- densityKernelFct(riskMap, tractsTessIm) summary(riskStats) tractShp$LRRmedD <- riskStats$Density tractShp$LRRmedV <- riskStats$varDens mapBiPolar(tractShp$LRRmedD, tractShp, neg.breaks=4, pos.breaks=6, legend.cex = 0.6, map.title="Integrated Relative Weighted Risk Surface with High BW", legend.title= "Log Relative Risk", add.to.map=F) plot(lakesShp, col="turquoise", border="turquoise",add=T) plot(hwyShp, col="cornsilk3", lwd=3, add=T) plot(foodDesertShp, border="magenta",lwd=2, add=T)
## Identify areas exessive and lower risk ## decrease error prob: alpa=1-level riskMapSig <- logrr(pts, case=2, level=0.95, alternative="two.sided", sigma=2000, sigmacon=4000, nsim=99, weights=ptsDf$FOODSALES) bound <- 5 plot(riskMapSig, main="Significant Relative Risk Surface at 5%\nWhite denotes insignificance", breaks=seq(-bound, bound, length.out=16+1), col=diverge_hsv(16)) plot(tractsTess, add=T) box(); axis(1); axis(2) ## Global Test logrr.test(riskMapSig)
The last code chunk models the general food supply within census tracts using a
set of socio-economic and demographic variables.
First, however, those census tracts without population need to be excluded from the analysis. These are the airports of Love Field and DFW International.
## Identify two tracts without night population and map them tractShp@data[tractShp$NIGHTPOP <= 0, "SeqId" ] tractShp$airport <- 0 tractShp$airport[c(154,248)] <- 1 tractShp$airport <- factor(tractShp$airport, labels=c("no","airport")) mapColorQual(tractShp$airport, tractShp, legend.cex=0.7, legend.title = "Airport\nTracts", map.title = "Airport Census Tracts\nto be Excluded from the Analysis") plot(lakesShp, col="blue", border="blue",add=T) plot(hwyShp, col="red", lwd=3, add=T) ## Remove both records from tractShp tractShp <- tractShp[-c(154,248),]
The following tasks are performed:
A scatter-plot matrix explores the distribution of the variables.
A preliminary regression model aims at explaining the distribution of the food supply density.
Non-linearities are evaluated with the residualPlot
function. It suggests
that log(PCTDAYPOP)
should be specified as a quadratic function.
Effect plots visualize the non-linear relationships.
The function influenceIndexPlot
checks for outliers and influential
observations.
## ## Save augmented (with Density Estimates) Census Tract shape file ## # rgdal::writeOGR(tractShp, dsn=getwd(), layer="DallasFoodTracts", # overwrite_layer=T, driver="ESRI Shapefile") # # tractShp <- rgdal::readOGR(dsn=".", layer="DallasFoodTracts", # integer64="warn.loss") ## Preliminary regression model car::scatterplotMatrix(~all3000D+log(POPDEN)+log(PCTDAYPOP)+log(PCTBLACK+1)+PCTUNIVDEG+ PCTBADENG +log(PCTHUVAC+1)+log(PCTNOVEH+1)+PCTNOHINS, data=tractShp, main="Potential Variables Impacting the Store-Density", pch=20, smooth=list(span = 0.35,lty.smooth=1, col.smooth="red", col.var="red"), regLine=list(col="green")) all3000.lm1 <- lm(all3000D~log(POPDEN)+log(PCTDAYPOP)+log(PCTBLACK+1)+PCTUNIVDEG+ PCTBADENG +log(PCTHUVAC+1)+log(PCTNOVEH+1)+PCTNOHINS, data=tractShp) summary(all3000.lm1) car::vif(all3000.lm1) car::residualPlots(all3000.lm1) all3000.lm2 <- lm(all3000D~log(POPDEN)+log(PCTDAYPOP)+I(log(PCTDAYPOP)^2)+log(PCTBLACK+1)+ PCTUNIVDEG+PCTBADENG +log(PCTHUVAC+1)+log(PCTNOVEH+1)+PCTNOHINS, data=tractShp) summary(all3000.lm2) plot(allEffects(all3000.lm2)) car::influenceIndexPlot(all3000.lm2, id=list(n=3))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.