title: "Distance Index Analysis (vignette)"
date: "r Sys.Date()
"
output:
rmarkdown::html_vignette:
toc: true
vignette: >
%\VignetteIndexEntry{SPAG tutorial}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteDepends{Cairo}
%\VignetteEncoding{UTF-8}
\usepackage[utf8]{inputenc}
knitr::opts_chunk$set(echo = TRUE)
Distance Index, bêd¹cy czêœci¹ wyliczanego indeksu SPAG jest wyliczany poprzez wyznaczenie stosunku œredniej odleg³oœci pomiêdzy firmami do œredniej odleg³oœci pomiêdzy firmami w przypadku gdy by³yby roz³o¿one w sposób jednorodny na obszarze. Taki sposób zdefiniowania tego indeksu sprawia pewne trudnoœci - w algorytmie korzysta siê ze œredniej odleg³oœci pomiêdzy firmami, który ma z³o¿onoœæ pamiêciow¹ wynosz¹c¹ $O(n^2)$, co sprawia, ¿e nawet dla niewielkiej liczby firm wyznaczenie wartoœci tego indeksu mo¿e okazaæ siê zbyt czasoch³onne dla przeciêtnego komputera. Problem z czasem wyliczania indeksu wydaje siê byæ na tyle powa¿ny, ¿e niezbêdne jest opracowanie bardziej wydajnego sposobu jego otrzymywanie. W poni¿szym opracowaniu przedstawiona jest analiza otrzymywanych wartoœci indeksu Distance, w przypadku, gdy jest on wyliczany jedynie na pewnym podzbiorze firm.
Na potrzeby dalszych analiz funkcja wyliczaj¹ca indeks Distance zosta³a uproszczona, tak aby odleg³oœæ w przypadku jednorodnego rozmieszczenia firm by³a podawana jako argument funkcji.
calcDistanceIndex <- function(coordsCategoryDF, region, theoreticalMean, empiricalCompaniesSample){ currentCategory <- coordsCategoryDF[,c(1,2)] k = nrow(currentCategory) liczbaPraw <- min(empiricalCompaniesSample,k) indeksy<-sample(1:k, liczbaPraw, replace = FALSE) currentCategoryFinal <- currentCategory[as.vector(indeksy),] IDist <- mean(dist(currentCategoryFinal))/theoreticalMean return(IDist) }
Na potrzeby testów wykorzystany zosta³ zbiór ok 37 tysiêcy firm znajduj¹cych siê w województwie lubelskim, a tak¿e mapa województwa Lubelskiego. Testy zosta³y przeprowadzone na podzbiorach ró¿nej licznoœci.
library(tidyr) library(SPAG) setwd("C:/Users/Max/Desktop/TestEmpiryczny") dane<-read.csv("geoloc data.csv", header=TRUE, sep=";", dec=".") ShapefileDF<-as.data.frame(ShapefilePoland) region<-ShapefilePoland#[ShapefileDF$jpt_nazwa_=="lubelskie",] newCoordinateSystem<-"+proj=longlat +datum=WGS84" region<-spTransform(region, CRS(newCoordinateSystem)) mapDF <- fortify(region)
Pierwszy zestaw testowy - zbiór punktóW w jednym miejscu:
TestCompanies1 <- dane[dane$SEK_PKD7 %in% c("C"),c(23,24)] # Kategoria C 1327 firm TestCompanies2 <- dane[dane$SEK_PKD7 %in% c("C", "M", "Q","S"),c(23,24)] TestCompanies3 <- dane[dane$SEK_PKD7 %in% c("C", "M", "Q","S", "G", "O"),c(23,24)] TestCompanies4 <- dane[dane$SEK_PKD7 %in% c("A"),c(23,24)] cat(paste("LiczebnoϾ zbioru 1:", nrow(TestCompanies1), "\nLiczebnoϾ zbioru 2:",nrow(TestCompanies2), "\nLiczebnoϾ zbioru 3:",nrow(TestCompanies3),"\nLiczebnoϾ zbioru 4:",nrow(TestCompanies4)))
Drugi zestaw testowy - dwa skupiska punktów, w stosunku 1:5
df1 <- as.data.frame(cbind(TestCompanies1[1:275,1]-6,TestCompanies1[1:275,2]+2)) df2 <- TestCompanies1[276:nrow(TestCompanies1),c(1,2)] names(df1) <- names(df2) TestCompanies5 <- rbind(df1,df2) ggplot() + geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) + geom_point(data=TestCompanies5, aes(coords.x1,coords.x2)) + theme_nothing() + labs(long="longitude", lat="latitude")
Trzeci zestaw testowy - dwa skupiska punktów, w stosunku 1:5. Liczba punktów wynosi 5010.
df1 <- as.data.frame(cbind(TestCompanies2[1:1002,1]-6,TestCompanies2[1:1002,2]+2)) df2 <- TestCompanies2[1003:nrow(TestCompanies2),c(1,2)] names(df1) <- names(df2) TestCompanies6 <- rbind(df1,df2) ggplot() + geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) + geom_point(data=TestCompanies6, aes(coords.x1,coords.x2)) + theme_nothing() + labs(long="longitude", lat="latitude")
Czwarty zestaw testowy - dwa skupiska punktów, w stosunku 1:5. Liczba punktów wynosi 10016
df1 <- as.data.frame(cbind(TestCompanies3[1:2003,1]-6,TestCompanies3[1:2003,2]+2)) df2 <- TestCompanies3[2004:nrow(TestCompanies3),c(1,2)] names(df1) <- names(df2) TestCompanies7 <- rbind(df1,df2) ggplot() + geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) + geom_point(data=TestCompanies7, aes(coords.x1,coords.x2)) + theme_nothing() + labs(long="longitude", lat="latitude")
Pi¹ty zestaw testowy - dwa skupiska punktów, w stosunku 1:5. Liczba punktów wynosi 20864
df1 <- as.data.frame(cbind(TestCompanies4[1:4173,1]-6,TestCompanies4[1:4173,2]+2)) df2 <- TestCompanies4[4174:nrow(TestCompanies4),c(1,2)] names(df1) <- names(df2) TestCompanies8 <- rbind(df1,df2) ggplot() + geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) + geom_point(data=TestCompanies8, aes(coords.x1,coords.x2)) + theme_nothing() + labs(long="longitude", lat="latitude")
Szósty zestaw testowy - dwa skupiska punktów, w stosunku 1:20. Liczba punktów wynosi 20864:
df1 <- as.data.frame(cbind(TestCompanies4[1:1000,1]-6,TestCompanies4[1:1000,2]+2)) df2 <- TestCompanies4[1001:nrow(TestCompanies4),c(1,2)] names(df1) <- names(df2) TestCompanies9 <- rbind(df1,df2) ggplot() + geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) + geom_point(data=TestCompanies9, aes(coords.x1,coords.x2)) + theme_nothing() + labs(long="longitude", lat="latitude")
Siódmy zestaw - trzy skupiska punktów, w stosunku 1:5:20. Liczba punktów wynosi 20864:
df1 <- as.data.frame(cbind(TestCompanies4[1:1000,1]-6,TestCompanies4[1:1000,2]+2)) df3 <- as.data.frame(cbind(TestCompanies4[1001:1200,1]-3,TestCompanies4[1001:1200,2]-1)) df2 <- TestCompanies4[1201:nrow(TestCompanies4),c(1,2)] names(df1) <- names(df2) names(df3) <- names(df2) TestCompanies10 <- rbind(df1,df2,df3) ggplot() + geom_polygon(data=mapDF, aes(long, lat, group=group), colour='#808080', fill=NA) + geom_point(data=TestCompanies10, aes(coords.x1,coords.x2)) + theme_nothing() + labs(long="longitude", lat="latitude")
listaTestow <- list(TestCompanies1,TestCompanies2,TestCompanies3,TestCompanies4,TestCompanies5, TestCompanies6,TestCompanies7,TestCompanies8,TestCompanies9,TestCompanies10) listaWolumenów <- list(100,200,500,1000) for (numer in 1:(length(listaTestow))){ print(paste("Iteracja:", numer)) theoreticalDist <- spsample(region, nrow(listaTestow[[numer]]), type="regular", offset=c(0,0)) theoreticalDist <- mean(dist(theoreticalDist@coords)) print(paste("Odleg³oœæ teoretyczna:", theoreticalDist)) for (wolumen in 1:length(listaWolumenów)){ chwilowaRamka <- data.frame("nazwa" = numeric(0)) for(i in 1:1000){ chwilowaRamka <- rbind(chwilowaRamka, calcDistanceIndex(listaTestow[[numer]], region, theoreticalDist, listaWolumenów[[wolumen]])) } assign(paste('test', numer,'FULL',listaWolumenów[[wolumen]], sep=''), chwilowaRamka ) nazwa <- paste('test', numer,'FULL',listaWolumenów[[wolumen]], sep='') doZapisu <- paste0("save(",nazwa,", file='C:/Users/Max/Desktop/TestEmpiryczny/", nazwa, ".rda' )") eval(parse(text=doZapisu)) } }
WynikiDokladneIndeksuSPAG <- data.frame("nazwa" = numeric(0)) for (numer in 1:length(listaTestow)){ print(paste("Iteracja:", numer)) theoreticalDist <- spsample(region, nrow(listaTestow[[numer]]), type="regular", offset=c(0,0)) theoreticalDist <- mean(dist(theoreticalDist@coords)) print(paste("Odleg³oœæ teoretyczna:", theoreticalDist)) nowaWart <- calcDistanceIndex(listaTestow[[numer]], region, theoreticalDist, nrow(listaTestow[[numer]])) WynikiDokladneIndeksuSPAG <- rbind(WynikiDokladneIndeksuSPAG, nowaWart) print(paste("Odleg³oœæ empiryczna:", nowaWart)) save(WynikiDokladneIndeksuSPAG, file='C:/Users/Max/Desktop/TestEmpiryczny/WynikiDokladneIndeksuSPAG.rda') }
Wyniki:
library(dplyr) library(tidyr) setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("WynikiDokladneIndeksuSPAG.rda") load("test1FULL100.rda") load("test1FULL200.rda") load("test1FULL500.rda") load("test1FULL1000.rda") test1x <- cbind(test1FULL100,test1FULL200,test1FULL500,test1FULL1000) names(test1x) <- c("100", "200", "500", "1000") test1 <- test1x %>% gather(type,value) test1$type <- factor(as.factor(test1$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test1, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[1,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test2FULL100.rda") load("test2FULL200.rda") load("test2FULL500.rda") load("test2FULL1000.rda") test2x <- cbind(test2FULL100,test2FULL200,test2FULL500,test2FULL1000) names(test2x) <- c("100", "200", "500", "1000") test2 <- test2x %>% gather(type,value) test2$type <- factor(as.factor(test2$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test2, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[2,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test3FULL100.rda") load("test3FULL200.rda") load("test3FULL500.rda") load("test3FULL1000.rda") test3x <- cbind(test3FULL100,test3FULL200,test3FULL500,test3FULL1000) names(test3x) <- c("100", "200", "500", "1000") test3 <- test3x %>% gather(type,value) test3$type <- factor(as.factor(test2$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test3, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[3,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test4FULL100.rda") load("test4FULL200.rda") load("test4FULL500.rda") load("test4FULL1000.rda") test4x <- cbind(test4FULL100,test4FULL200,test4FULL500,test4FULL1000) names(test4x) <- c("100", "200", "500", "1000") test4 <- test4x %>% gather(type,value) test4$type <- factor(as.factor(test4$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test4, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[4,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test5FULL100.rda") load("test5FULL200.rda") load("test5FULL500.rda") load("test5FULL1000.rda") test5x <- cbind(test5FULL100,test5FULL200,test5FULL500,test5FULL1000) names(test5x) <- c("100", "200", "500", "1000") test5 <- test5x %>% gather(type,value) test5$type <- factor(as.factor(test5$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test5, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[5,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test6FULL100.rda") load("test6FULL200.rda") load("test6FULL500.rda") load("test6FULL1000.rda") test6x <- cbind(test6FULL100,test6FULL200,test6FULL500,test6FULL1000) names(test6x) <- c("100", "200", "500", "1000") test6 <- test6x %>% gather(type,value) test6$type <- factor(as.factor(test6$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test6, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[6,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test7FULL100.rda") load("test7FULL200.rda") load("test7FULL500.rda") load("test7FULL1000.rda") test7x <- cbind(test7FULL100,test7FULL200,test7FULL500,test7FULL1000) names(test7x) <- c("100", "200", "500", "1000") test7 <- test7x %>% gather(type,value) test7$type <- factor(as.factor(test7$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test7, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[7,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test8FULL100.rda") load("test8FULL200.rda") load("test8FULL500.rda") load("test8FULL1000.rda") test8x <- cbind(test8FULL100,test8FULL200,test8FULL500,test8FULL1000) names(test8x) <- c("100", "200", "500", "1000") test8 <- test8x %>% gather(type,value) test8$type <- factor(as.factor(test4$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test8, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[8,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test9FULL100.rda") load("test9FULL200.rda") load("test9FULL500.rda") load("test9FULL1000.rda") test9x <- cbind(test9FULL100,test9FULL200,test9FULL500,test9FULL1000) names(test9x) <- c("100", "200", "500", "1000") test9 <- test9x %>% gather(type,value) test9$type <- factor(as.factor(test9$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test9, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[9,], col="red")
setwd("C:/Users/Max/Desktop/TestEmpiryczny") load("test10FULL100.rda") load("test10FULL200.rda") load("test10FULL500.rda") load("test10FULL1000.rda") test10x <- cbind(test10FULL100,test10FULL200,test10FULL500,test10FULL1000) names(test10x) <- c("100", "200", "500", "1000") test10 <- test10x %>% gather(type,value) test10$type <- factor(as.factor(test10$type),c("100", "200", "500", "1000")) ggplot() + geom_boxplot(data=test10, mapping=aes(type,value)) + geom_hline(yintercept = WynikiDokladneIndeksuSPAG[10,], col="red")
rbind( apply(test1x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test2x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test3x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test4x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test5x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test6x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test7x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test8x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test9x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}), apply(test10x, MARGIN=2, FUN=function(x){sd(x)/mean(x)}) )
mat <- matrix(rbind( apply(test1x, MARGIN=2, FUN=function(x){mean(x)}), apply(test2x, MARGIN=2, FUN=function(x){mean(x)}), apply(test3x, MARGIN=2, FUN=function(x){mean(x)}), apply(test4x, MARGIN=2, FUN=function(x){mean(x)}), apply(test5x, MARGIN=2, FUN=function(x){mean(x)}), apply(test6x, MARGIN=2, FUN=function(x){mean(x)}), apply(test7x, MARGIN=2, FUN=function(x){mean(x)}), apply(test8x, MARGIN=2, FUN=function(x){mean(x)}), apply(test9x, MARGIN=2, FUN=function(x){mean(x)}), apply(test10x, MARGIN=2, FUN=function(x){mean(x)}) ), nrow=10) for (i in 1:10){ print(abs((mat[i,]-WynikiDokladneIndeksuSPAG[i,])/mat[i,])) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.