knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(tidyverse)
library(ggplot2)
library(plotly)
library(stats)
library(lubridate)
library(ggplot2)
library(plotly)
library(stats)
library(lubridate)
library(sqldf)
library(forecast)

graves <- readRDS("../data/graves.rds")

graves <- graves %>% distinct()

Date of birth analysis

date_birth_n <- graves %>%
  group_by(g_date_birth) %>%
  count()

date_birth_n %>%
  arrange(desc(n)) %>%
  head(n=10)

Counts at cementaries

cementy <- openPoznan::cemeteries(coords = F)

graves_name <- left_join(graves, cementy, by=c("cm_id"="ID"))

graves_name <- graves_name %>% distinct()

ggplot(graves_name, aes(as.factor(Cemetery_Name))) + 
  geom_bar(aes(y = (..count..)/sum(..count..))) +
    geom_text(aes(label = scales::percent((..count..)/sum(..count..)),
                   y= (..count..)/sum(..count..)), size=3, stat= "count", hjust = -.5) +
  xlab("Cementary name") + 
  ylab("Percent") + 
  scale_y_continuous(labels=scales::percent) +
  theme_bw()+
  coord_flip()

Paid and Graves size

grave_size <- graves %>%
  group_by(paid,g_size) %>%
  count() %>%
  filter(g_size != "", !is.na(g_size))

ggplot(grave_size, aes(x=as.numeric(g_size), y=n)) + 
  geom_col() +
  facet_wrap(~ paid) +
  xlab("Grave size") +
  ylab("number")

clean data

# oczyszczanie danych z błędnych danych 


birth <- graves %>% filter(g_date_birth != "0001-01-01", 1) %>% filter(g_date_birth != "1900-01-01", 1) %>% 
  select(g_date_birth) %>% 
  arrange(g_date_birth) %>%
  count(g_date_birth) %>% 
   mutate(g_date_birth=ymd(g_date_birth), # zamiana na datę w formacie ymd z pakietu lubridate
         g_year_birth=year(g_date_birth)) %>% distinct()

plot_ly(data = birth, x = ~g_date_birth, y = ~n, type = "bar") %>%
layout(xaxis = list(range = c("1800","2018")))

death <- graves %>%filter(g_date_death != "0001-01-01", 1) %>% 
  filter(g_date_death != "1900-01-01", 1) %>% 
  filter(g_date_death != "1945-01-01", 1) %>%
  select(g_date_death) %>% 
  count(g_date_death) %>% arrange(g_date_death) %>%
mutate(g_date_death=ymd(g_date_death), # zamiana na datę w formacie ymd z pakietu lubridate
         g_year_birth=year(g_date_death)) %>% distinct()

plot_ly(data = death, x = ~g_date_death, y = ~n, type = "bar") %>% 
  layout(xaxis = list(range = c("1800","2018")))


burial <- graves %>% filter(g_date_burial != "0001-01-01", 1) %>%
  select(g_date_burial) %>% 
  arrange(g_date_burial) %>% count(g_date_burial) %>%
mutate(g_date_burial=ymd(g_date_burial), # zamiana na datę w formacie ymd z pakietu lubridate
         g_year_birth=year(g_date_burial)) %>% distinct()

plot_ly(data = burial, x = ~g_date_burial, y = ~n, type = "bar") %>%
 layout(xaxis = list(range = c("1800","2018")))

# analiza szeregów czasowych 

datatime analysis

birth

graves_birth <- graves %>%
  filter(g_date_birth != "0001-01-01", 1) %>%
  mutate(g_date_birth=ymd(g_date_birth), # zamiana na datę w formacie ymd z pakietu lubridate
         g_year_birth=year(g_date_birth)) # wydobycie roku

graves_birth_count <- graves_birth %>%
  group_by(g_year_birth) %>%
  count()

# w roku urodzenia wystęuje wartość 1 aż 16 razy - po sprawdzeniu zbioru graves_birth okazuje się, że jedna osoba występuje tam 16 razy - czyli w zbiorze występują duplikaty

graves_birth_distinct <- graves_birth %>%
  distinct()

graves_birth_count_distinct <- graves_birth_distinct %>%
  group_by(g_year_birth) %>%
  count()

# usuwamy nieprawidłowe wartości

graves_birth_distinct <- graves_birth_distinct %>%
  filter(g_year_birth %in% c(1000:2018)) # lata urodzenia w przedziale od 1000 do 2018

ggplot(graves_birth_distinct, aes(g_year_birth)) +
  geom_bar()
ggplot(graves_birth_distinct, aes(g_year_birth)) +
  geom_bar() +
  xlim(1850,2018)

usunięcie daty 1900-01-01, która też jest brakiem danych

# usunięcie daty 1900-01-01, która też jest brakiem danych

graves_birth_distinct %>% distinct() %>%
  filter(g_date_birth != "1900-01-01") %>%
  ggplot(aes(g_year_birth)) +
  geom_bar() +
  xlim(1850,2018)

pojedyncze roczniki

# pojedyncze roczniki

graves_birth_distinct %>% distinct() %>%
  filter(g_date_birth != c("1900-01-01"),1) %>%
  ggplot(aes(g_year_birth)) +
  geom_histogram(binwidth = 1) +
  xlim(1850,2018)

death

graves_death <- graves %>% 
  filter(g_date_death != "0001-01-01", 1) %>% 
  mutate(g_date_death=ymd(g_date_death),
  g_year_death=year(g_date_death))

graves_death_counr <- graves_death %>% 
  group_by(g_year_death) %>%
  count()

graves_death_distinct <- graves_death %>%
  distinct()


graves_death_count_distinct <- graves_death_distinct %>%
  group_by(g_year_death) %>%
  count()

graves_death_distinct <- graves_death_distinct %>%
  filter(g_year_death %in% c(100:2018)) 

ggplot(graves_death_distinct, aes(g_year_death)) +
  geom_bar()
graves_death_distinct %>% distinct() %>%
  filter(g_date_death != "1900-01-01") %>%
  ggplot(aes(g_year_death)) +
  geom_bar() +
  xlim(1850,2018)
# pojedyncze roczniki

graves_death_distinct %>% distinct() %>%
  filter(g_date_death != "1900-01-01") %>%
  ggplot(aes(g_year_death)) +
  geom_histogram(binwidth = 1) +
  xlim(1900,2018)

burial

graves_burial <- graves %>% 
  filter(g_date_burial != "0001-01-01", 1) %>% 
  mutate(g_date_burial=ymd(g_date_burial),
         g_year_burial=year(g_date_burial))

graves_burial_count <- graves_burial %>% 
  group_by(g_date_burial) %>% 
  count()

graves_burial_distinct <- graves_burial %>%
  distinct()

graves_burial_count_distinct <- graves_burial_distinct %>%
  group_by(g_year_burial) %>%
  count()

graves_burial_distinct <- graves_burial_distinct %>%
  filter(g_year_burial %in% c(1100:2018)) 

ggplot(graves_burial_distinct, aes(g_date_burial)) +
  ylim(0,100)+
  geom_bar()
graves_burial_distinct %>% distinct() %>%
  filter(g_date_burial != c("1900-01-01","1945-01-01")) %>%
  ggplot(aes(g_year_burial)) +
  geom_bar() +
  xlim(1850,2018)
graves_burial_distinct %>% distinct() %>%
  filter(g_date_burial != "1900-01-01") %>%
  ggplot(aes(g_year_burial)) +
  geom_histogram(binwidth = 1) +
  xlim(1850,2018)

all birth, death, burial

graves_birth_count_distinct %>% ggplot(aes(x=g_year_birth), color = "red")+
  geom_col(data = graves_birth_count_distinct, aes(g_year_birth,y=n), color = "red")+
 geom_col(data = graves_death_count_distinct, aes(g_year_death,y=n), color = "blue")+
 geom_col(data = graves_burial_count_distinct, aes(g_year_burial,y=n), color = "green")+
  ylim(0,2000)+
  xlim(1750,2018)
datatime <- data2 %>% mutate(year = year(g_date_death)) %>% 
  ggplot(aes(x = year, y = n)) +
  geom_col()+
  xlim(1910,1921)
datatime2 <- data2 %>% mutate(year = year(g_date_death)) %>% 
  ggplot(aes(x = year, y = n)) +
  geom_col()+
  xlim(1920,1931)
datatime3 <- data2 %>% mutate(year = year(g_date_death)) %>% 
  ggplot(aes(x = year, y = n)) +
  geom_col()+
  xlim(1930,1941)
datatime
datatime2
datatime3

# miłostowo
birth_cm_2 <- graves_birth %>% select(c(cm_id,g_year_birth)) %>%
  count(g_year_birth,cm_id) 

birth_clean_cm_2 <- birth_cm_2 %>% 
  filter(cm_id == 2) %>%
  filter(g_year_birth != "1900") %>%
  arrange(g_year_birth) %>% filter(g_year_birth >= "1900", g_year_birth <= "2016")

plot_ly(data = birth_clean_cm_2, x = ~g_year_birth, y = ~n, type = "scatter", mode = "lines+markers")

# junikowo

birth_cm_1 <- graves_birth %>% select(c(cm_id,g_year_birth)) %>%
  count(g_year_birth,cm_id) 

birth_clean_cm_1 <- birth_cm_1 %>% 
  filter(cm_id == 1) %>%
  filter(g_year_birth != "1900") %>%
  arrange(g_year_birth) %>% filter(g_year_birth >= "1900", g_year_birth <= "2016")

plot_ly(data = birth_clean_cm_1, x = ~g_year_birth, y = ~n, type = "scatter", mode = "lines+markers")


# jeżycki zabytkowy 

birth_cm_4 <- graves_birth %>% select(c(cm_id,g_year_birth)) %>%
  count(g_year_birth,cm_id) 

birth_clean_cm_4 <- birth_cm_4 %>% 
  filter(cm_id == 4) %>%
  filter(g_year_birth != "1900") %>%
  arrange(g_year_birth) %>% filter(g_year_birth >= "1900", g_year_birth <= "2016")

plot_ly(data = birth_clean_cm_4, x = ~g_year_birth, y = ~n, type = "scatter", mode = "lines+markers")

# górczyn

birth_cm_6 <- graves_birth %>% select(c(cm_id,g_year_birth)) %>%
  count(g_year_birth,cm_id) 

birth_clean_cm_6 <- birth_cm_6 %>% 
  filter(cm_id == 6) %>%
   filter(g_year_birth != "1900") %>%
  arrange(g_year_birth) %>% filter(g_year_birth >= "1900", g_year_birth <= "2016")

plot_ly(data = birth_clean_cm_6, x = ~g_year_birth, y = ~n, type = "scatter", mode = "lines+markers")



All_birth <-graves_birth %>% select(c(cm_id,g_year_birth)) %>%
  count(g_year_birth,cm_id) 

#all

All_birth_clean <- birth_cm_2 %>% 
  filter(cm_id == c("1","2","4","6")) %>%
  filter(g_year_birth != "1900") %>%
  arrange(g_year_birth) %>% filter(g_year_birth >= "1900", g_year_birth <= "2016")

ggplot(All_birth_clean, aes(x=as.numeric(g_year_birth), y=n)) + 
  geom_line() +
  facet_wrap(~ g_year_birth) +
  xlab("") +
  ylab("")
ggplot(data = birth_clean_cm_2, aes(x=g_year_birth,y=n), color = "red")+
  geom_col(data = birth_clean_cm_2, aes(g_year_birth,y=n), color = "red")+
 geom_col(data = birth_clean_cm_4, aes(g_year_birth,y=n), color = "blue")+
 geom_col(data = birth_clean_cm_6, aes(g_year_birth,y=n), color = "green")+
  geom_col(data = birth_clean_cm_1, aes(g_year_birth,y=n), color = "yellow")+
  facet_wrap(~n)+
  ylim(0,2000)+
  xlim(1900,2016)

datatime analysis cemetery no 4 "Zabytkowy Jeżycki"

choice_cm_4 <- graves_death %>% select(c(cm_id,g_year_death)) %>%
  count(g_year_death,cm_id) 

clean_cm_4 <- choice_cm_4 %>% 
  filter(cm_id == 4) %>%
  arrange(g_year_death) %>% filter(g_year_death >= "1999", g_year_death <= "2016")


plot_ly(data = clean_cm_4, x = ~g_year_death, y = ~n, type = "scatter", mode = "lines+markers")

plot(clean_cm_4$n, main = "zgony")

boxplot(clean_cm_4$n)

mean_4 <- mean(clean_cm_4$n)
stat_4 <- t(summary(clean_cm_4$n))
sd_4 <- sd(clean_cm_4$n)
sd_4
var_4 <- var(clean_cm_4$n)
var_4

statystic_4 <- cbind(sd_4,var_4,stat_4)
statystic_4

acf(clean_cm_4$n)
pacf(clean_cm_4$n)
frequency(clean_cm_4$n)
library(forecast)
graves.prediction_4 <- ets(clean_cm_4$n)
graves.prediction.forecast_4 <- forecast(graves.prediction_4)
plot(graves.prediction_4)
plot(graves.prediction.forecast_4)


log <- (log(clean_cm_4$n))
plot(log)
plot(clean_cm_4$n)

datatime analysis cemetery no 6 "Górczyn"

choice_cm_6 <- graves_death %>% select(c(cm_id,g_year_death)) %>%
  count(g_year_death,cm_id) 

clean_cm_6 <- choice_cm_6 %>% 
  filter(cm_id == 6) %>%
  arrange(g_year_death) %>% filter(g_year_death >= "1999", g_year_death <= "2016")

plot_ly(data = clean_cm_6, x = ~g_year_death, y = ~n, type = "scatter", mode = "lines+markers")

plot(clean_cm_6$n, main = "zgony")

boxplot(clean_cm_6$n)

mean_6 <- mean(clean_cm_6$n)
stat_6 <- t(summary(clean_cm_6$n))
sd_6 <- sd(clean_cm_6$n)
sd_6
var_6 <- var(clean_cm_6$n)
var_6

statystic_6 <- cbind(sd_6,var_6,stat_6)
statystic_6

acf(clean_cm_6$n)
pacf(clean_cm_6$n)
frequency(clean_cm_6$n)
graves.prediction_6 <- ets(clean_cm_6$n)
graves.prediction.forecast_6 <- forecast(graves.prediction_6)
plot(graves.prediction_6)
plot(graves.prediction.forecast_6)



log_6 <- (log(clean_cm_6$n))
plot(log)
plot(clean_cm_6$n)

fall birthday on account of I World War and II World War

choice_cm_2 <- graves_birth %>% select(c(cm_id,g_year_birth)) %>%
  count(g_year_birth,cm_id) 

clean_cm_2 <- choice_cm_2 %>% 
  filter(cm_id == 2) %>%
  arrange(g_year_birth) %>% filter(g_year_birth >= "1920", g_year_birth <= "1950")

plot_ly(data = clean_cm_2, x = ~g_year_birth, y = ~n, type = "scatter", mode = "lines+markers")


lwawrowski/openPoznan documentation built on July 6, 2019, 4:48 p.m.