In pbiecek/SPAG: Indexes of spatial agglomeration

title: "Distance Index Analysis (vignette)" date: "r Sys.Date()" output: rmarkdown::html_vignette: toc: true vignette: > %\VignetteIndexEntry{SPAG tutorial} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{Cairo} %\VignetteEncoding{UTF-8} \usepackage[utf8]{inputenc}

Construction of Distance Index

knitr::opts_chunk$set(echo = TRUE)

Distance Index, bêd¹cy czêœci¹ wyliczanego indeksu SPAG jest wyliczany poprzez wyznaczenie stosunku œredniej odleg³oœci pomiêdzy firmami do œredniej odleg³oœci pomiêdzy firmami w przypadku gdy by³yby roz³o¿one w sposób jednorodny na obszarze. Taki sposób zdefiniowania tego indeksu sprawia pewne trudnoœci - w algorytmie korzysta siê ze œredniej odleg³oœci pomiêdzy firmami, który ma z³o¿onoœæ pamiêciow¹ wynosz¹c¹ $O(n^2)$, co sprawia, ¿e nawet dla niewielkiej liczby firm wyznaczenie wartoœci tego indeksu mo¿e okazaæ siê zbyt czasoch³onne dla przeciêtnego komputera. Problem z czasem wyliczania indeksu wydaje siê byæ na tyle powa¿ny, ¿e niezbêdne jest opracowanie bardziej wydajnego sposobu jego otrzymywanie. W poni¿szym opracowaniu przedstawiona jest analiza otrzymywanych wartoœci indeksu Distance, w przypadku, gdy jest on wyliczany jedynie na pewnym podzbiorze firm.

Funkcja Distance

Na potrzeby dalszych analiz funkcja wyliczaj¹ca indeks Distance zosta³a uproszczona, tak aby odleg³oœæ w przypadku jednorodnego rozmieszczenia firm by³a podawana jako argument funkcji.

calcDistanceIndex <- function(coordsCategoryDF, region, theoreticalMean, empiricalCompaniesSample){
  currentCategory <- coordsCategoryDF[,c(1,2)]
  k = nrow(currentCategory)
  liczbaPraw <- min(empiricalCompaniesSample,k)
  indeksy<-sample(1:k, liczbaPraw, replace = FALSE)
  currentCategoryFinal <- currentCategory[as.vector(indeksy),]
  IDist <- mean(dist(currentCategoryFinal))/theoreticalMean
  return(IDist)
}

Zbióry testowe.

Na potrzeby testów wykorzystany zosta³ zbiór ok 37 tysiêcy firm znajduj¹cych siê w województwie lubelskim, a tak¿e mapa województwa Lubelskiego. Testy zosta³y przeprowadzone na podzbiorach ró¿nej licznoœci.

library(tidyr)
library(SPAG)
setwd("C:/Users/Max/Desktop/TestEmpiryczny")
dane<-read.csv("geoloc data.csv", header=TRUE, sep=";", dec=".")

ShapefileDF<-as.data.frame(ShapefilePoland)
region<-ShapefilePoland#[ShapefileDF$jpt_nazwa_=="lubelskie",]
newCoordinateSystem<-"+proj=longlat +datum=WGS84"
region<-spTransform(region, CRS(newCoordinateSystem))
mapDF <- fortify(region)

Pierwszy zestaw testowy - zbiór punktóW w jednym miejscu:

TestCompanies1 <- dane[dane$SEK_PKD7 %in% c("C"),c(23,24)] #  Kategoria C 1327 firm
TestCompanies2 <- dane[dane$SEK_PKD7 %in% c("C", "M", "Q","S"),c(23,24)]
TestCompanies3 <- dane[dane$SEK_PKD7 %in% c("C", "M", "Q","S", "G", "O"),c(23,24)]
TestCompanies4 <- dane[dane$SEK_PKD7 %in% c("A"),c(23,24)]
cat(paste("Liczebnoœæ zbioru 1:", nrow(TestCompanies1), "\nLiczebnoœæ zbioru 2:",nrow(TestCompanies2),
            "\nLiczebnoœæ zbioru 3:",nrow(TestCompanies3),"\nLiczebnoœæ zbioru 4:",nrow(TestCompanies4)))

Drugi zestaw testowy - dwa skupiska punktów, w stosunku 1:5

df1 <- as.data.frame(cbind(TestCompanies1[1:275,1]-6,TestCompanies1[1:275,2]+2))
df2 <- TestCompanies1[276:nrow(TestCompanies1),c(1,2)]
names(df1) <- names(df2)
TestCompanies5 <- rbind(df1,df2)

ggplot() +
  geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) +
  geom_point(data=TestCompanies5, aes(coords.x1,coords.x2)) +
  theme_nothing() +
  labs(long="longitude", lat="latitude")

Trzeci zestaw testowy - dwa skupiska punktów, w stosunku 1:5. Liczba punktów wynosi 5010.

df1 <- as.data.frame(cbind(TestCompanies2[1:1002,1]-6,TestCompanies2[1:1002,2]+2))
df2 <- TestCompanies2[1003:nrow(TestCompanies2),c(1,2)]
names(df1) <- names(df2)
TestCompanies6 <- rbind(df1,df2)
ggplot() +
  geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) +
  geom_point(data=TestCompanies6, aes(coords.x1,coords.x2)) +
  theme_nothing() +
  labs(long="longitude", lat="latitude")

Czwarty zestaw testowy - dwa skupiska punktów, w stosunku 1:5. Liczba punktów wynosi 10016

df1 <- as.data.frame(cbind(TestCompanies3[1:2003,1]-6,TestCompanies3[1:2003,2]+2))
df2 <- TestCompanies3[2004:nrow(TestCompanies3),c(1,2)]
names(df1) <- names(df2)
TestCompanies7 <- rbind(df1,df2)
ggplot() +
  geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) +
  geom_point(data=TestCompanies7, aes(coords.x1,coords.x2)) +
  theme_nothing() +
  labs(long="longitude", lat="latitude")

Pi¹ty zestaw testowy - dwa skupiska punktów, w stosunku 1:5. Liczba punktów wynosi 20864

df1 <- as.data.frame(cbind(TestCompanies4[1:4173,1]-6,TestCompanies4[1:4173,2]+2))
df2 <- TestCompanies4[4174:nrow(TestCompanies4),c(1,2)]
names(df1) <- names(df2)
TestCompanies8 <- rbind(df1,df2)
ggplot() +
  geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) +
  geom_point(data=TestCompanies8, aes(coords.x1,coords.x2)) +
  theme_nothing() +
  labs(long="longitude", lat="latitude")

Szósty zestaw testowy - dwa skupiska punktów, w stosunku 1:20. Liczba punktów wynosi 20864:

df1 <- as.data.frame(cbind(TestCompanies4[1:1000,1]-6,TestCompanies4[1:1000,2]+2))
df2 <- TestCompanies4[1001:nrow(TestCompanies4),c(1,2)]
names(df1) <- names(df2)
TestCompanies9 <- rbind(df1,df2)

ggplot() +
  geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) +
  geom_point(data=TestCompanies9, aes(coords.x1,coords.x2)) +
  theme_nothing() +
  labs(long="longitude", lat="latitude")

Siódmy zestaw - trzy skupiska punktów, w stosunku 1:5:20. Liczba punktów wynosi 20864:

df1 <- as.data.frame(cbind(TestCompanies4[1:1000,1]-6,TestCompanies4[1:1000,2]+2))
df3 <- as.data.frame(cbind(TestCompanies4[1001:1200,1]-3,TestCompanies4[1001:1200,2]-1))
df2 <- TestCompanies4[1201:nrow(TestCompanies4),c(1,2)]
names(df1) <- names(df2)
names(df3) <- names(df2)
TestCompanies10 <- rbind(df1,df2,df3)

ggplot() +
  geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) +
  geom_point(data=TestCompanies10, aes(coords.x1,coords.x2)) +
  theme_nothing() +
  labs(long="longitude", lat="latitude")

Testy:

listaTestow <- list(TestCompanies1,TestCompanies2,TestCompanies3,TestCompanies4,TestCompanies5,
                    TestCompanies6,TestCompanies7,TestCompanies8,TestCompanies9,TestCompanies10)

listaWolumenów <- list(100,200,500,1000)


for (numer in 1:(length(listaTestow))){
  print(paste("Iteracja:", numer))
  theoreticalDist <- spsample(region, nrow(listaTestow[[numer]]), type="regular", offset=c(0,0))
  theoreticalDist <- mean(dist(theoreticalDist@coords))
  print(paste("Odleg³oœæ teoretyczna:", theoreticalDist))

  for (wolumen in 1:length(listaWolumenów)){
    chwilowaRamka <- data.frame("nazwa" = numeric(0))

    for(i in 1:1000){
      chwilowaRamka <- rbind(chwilowaRamka, calcDistanceIndex(listaTestow[[numer]], region, theoreticalDist, listaWolumenów[[wolumen]]))
    }

    assign(paste('test', numer,'FULL',listaWolumenów[[wolumen]], sep=''), chwilowaRamka )
    nazwa <- paste('test', numer,'FULL',listaWolumenów[[wolumen]], sep='')
    doZapisu <- paste0("save(",nazwa,", file='C:/Users/Max/Desktop/TestEmpiryczny/", nazwa, ".rda' )")
    eval(parse(text=doZapisu))
  }
}

WynikiDokladneIndeksuSPAG <- data.frame("nazwa" = numeric(0))
for (numer in 1:length(listaTestow)){
  print(paste("Iteracja:", numer))
  theoreticalDist <- spsample(region, nrow(listaTestow[[numer]]), type="regular", offset=c(0,0))
  theoreticalDist <- mean(dist(theoreticalDist@coords))
  print(paste("Odleg³oœæ teoretyczna:", theoreticalDist))
  nowaWart <- calcDistanceIndex(listaTestow[[numer]], region, theoreticalDist, nrow(listaTestow[[numer]]))
  WynikiDokladneIndeksuSPAG <- rbind(WynikiDokladneIndeksuSPAG, nowaWart)
  print(paste("Odleg³oœæ empiryczna:", nowaWart))
  save(WynikiDokladneIndeksuSPAG, file='C:/Users/Max/Desktop/TestEmpiryczny/WynikiDokladneIndeksuSPAG.rda')

}

Wyniki:

library(dplyr)
library(tidyr)
setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("WynikiDokladneIndeksuSPAG.rda")
load("test1FULL100.rda")
load("test1FULL200.rda")
load("test1FULL500.rda")
load("test1FULL1000.rda")

test1x <- cbind(test1FULL100,test1FULL200,test1FULL500,test1FULL1000)
names(test1x) <- c("100", "200", "500", "1000")
test1 <- test1x %>% gather(type,value)
test1$type <- factor(as.factor(test1$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test1, mapping=aes(type,value)) +
   geom_hline(yintercept = WynikiDokladneIndeksuSPAG[1,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test2FULL100.rda")
load("test2FULL200.rda")
load("test2FULL500.rda")
load("test2FULL1000.rda")

test2x <- cbind(test2FULL100,test2FULL200,test2FULL500,test2FULL1000)
names(test2x) <- c("100", "200", "500", "1000")
test2 <- test2x %>% gather(type,value)
test2$type <- factor(as.factor(test2$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test2, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[2,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test3FULL100.rda")
load("test3FULL200.rda")
load("test3FULL500.rda")
load("test3FULL1000.rda")

test3x <- cbind(test3FULL100,test3FULL200,test3FULL500,test3FULL1000)
names(test3x) <- c("100", "200", "500", "1000")
test3 <- test3x %>% gather(type,value)
test3$type <- factor(as.factor(test2$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test3, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[3,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test4FULL100.rda")
load("test4FULL200.rda")
load("test4FULL500.rda")
load("test4FULL1000.rda")

test4x <- cbind(test4FULL100,test4FULL200,test4FULL500,test4FULL1000)
names(test4x) <- c("100", "200", "500", "1000")
test4 <- test4x %>% gather(type,value)
test4$type <- factor(as.factor(test4$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test4, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[4,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test5FULL100.rda")
load("test5FULL200.rda")
load("test5FULL500.rda")
load("test5FULL1000.rda")

test5x <- cbind(test5FULL100,test5FULL200,test5FULL500,test5FULL1000)
names(test5x) <- c("100", "200", "500", "1000")
test5 <- test5x %>% gather(type,value)
test5$type <- factor(as.factor(test5$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test5, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[5,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test6FULL100.rda")
load("test6FULL200.rda")
load("test6FULL500.rda")
load("test6FULL1000.rda")

test6x <- cbind(test6FULL100,test6FULL200,test6FULL500,test6FULL1000)
names(test6x) <- c("100", "200", "500", "1000")
test6 <- test6x %>% gather(type,value)
test6$type <- factor(as.factor(test6$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test6, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[6,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test7FULL100.rda")
load("test7FULL200.rda")
load("test7FULL500.rda")
load("test7FULL1000.rda")

test7x <- cbind(test7FULL100,test7FULL200,test7FULL500,test7FULL1000)
names(test7x) <- c("100", "200", "500", "1000")
test7 <- test7x %>% gather(type,value)
test7$type <- factor(as.factor(test7$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test7, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[7,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test8FULL100.rda")
load("test8FULL200.rda")
load("test8FULL500.rda")
load("test8FULL1000.rda")

test8x <- cbind(test8FULL100,test8FULL200,test8FULL500,test8FULL1000)
names(test8x) <- c("100", "200", "500", "1000")
test8 <- test8x %>% gather(type,value)
test8$type <- factor(as.factor(test4$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test8, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[8,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test9FULL100.rda")
load("test9FULL200.rda")
load("test9FULL500.rda")
load("test9FULL1000.rda")

test9x <- cbind(test9FULL100,test9FULL200,test9FULL500,test9FULL1000)
names(test9x) <- c("100", "200", "500", "1000")
test9 <- test9x %>% gather(type,value)
test9$type <- factor(as.factor(test9$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test9, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[9,], col="red")

setwd("C:/Users/Max/Desktop/TestEmpiryczny")
load("test10FULL100.rda")
load("test10FULL200.rda")
load("test10FULL500.rda")
load("test10FULL1000.rda")

test10x <- cbind(test10FULL100,test10FULL200,test10FULL500,test10FULL1000)
names(test10x) <- c("100", "200", "500", "1000")
test10 <- test10x %>% gather(type,value)
test10$type <- factor(as.factor(test10$type),c("100", "200", "500", "1000"))

ggplot() +
  geom_boxplot(data=test10, mapping=aes(type,value)) +
  geom_hline(yintercept = WynikiDokladneIndeksuSPAG[10,], col="red")

coefficient of variation

rbind(
  apply(test1x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test2x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test3x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test4x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test5x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test6x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test7x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test8x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test9x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}),
  apply(test10x, MARGIN=2, FUN=function(x){sd(x)/mean(x)})
)

mat <- matrix(rbind(
  apply(test1x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test2x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test3x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test4x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test5x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test6x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test7x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test8x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test9x, MARGIN=2, FUN=function(x){mean(x)}),
  apply(test10x, MARGIN=2, FUN=function(x){mean(x)})
), nrow=10)

for (i in 1:10){
  print(abs((mat[i,]-WynikiDokladneIndeksuSPAG[i,])/mat[i,]))
}