knitr::opts_chunk$set(echo = FALSE, fig.height=7.5, fig.width=10,
                      fig.align = 'center')
# Load packages
library(data.table)
library(dplyr)
library(naniar)
library(lubridate)
library(ggplot2)
devtools::load_all()

Load data

Data was downloaded manually from https://dane.imgw.pl/ and I stored them in 'data-raw' folder. Here are the dimensions of obtained data frame:

# specify location
path <- './data-raw/'

# function for batch .csv upload
do.call_rbind_fread <- function(path, pattern = "*.csv") {
  files = list.files(path, pattern, full.names = TRUE)
  do.call(rbind, lapply(files, function(x) fread(x, stringsAsFactors = FALSE)))
}

x <- do.call_rbind_fread(path)
dim(x)

The data set contained over 140000 rows and 107 columns. Unfortunately tables did not contain column names, so I needed to enter them manually ( I will skip that ugly part).

colnames <- c('Kod stacji','Nazwa stacji', 'Rok', 'Miesiac', 'Dzien', 'Godzina', 'Wysokosc podstawy chmur CL CM szyfrowana','Status pomiaru HPOD', 'Wysokosc podstawy nizszej', 'Status pomiaru HPON', 'Wysokosc podstawy wyzszej', 'Status pomiaru HPOW', 'Wysokosc podstawy tekstowy', 'Pomiar przyrzadem 1 (niższa)', 'Pomiar przyrzadem 2 (wyższa)', 'Widzialnosc', 'Status pomiaru WID', 'Widzialnosc operatora', 'Status pomiaru WIDO','Widzialnosc automat', 'Status pomiaru WIDA', 'Zachmurzenie ogólne', 'Status pomiaru NOG','Kierunek wiatru', 'Status pomiaru KRWR', 'Prędkosc wiatru', 'Status pomiaru FWR ','Poryw wiatru ', 'Status pomiaru PORW ', 'Temperatura powietrza', 'Status pomiaru TEMP','Temperatura termometru zwilżonego', 'Status pomiaru TTZW', 'Wskaznik wentylacji','Wskaznik lodu', 'Cisnienie pary wodnej', 'Status pomiaru CPW', 'Wilgotnosc względna','Status pomiaru WLGW ', 'Temperatura punktu rosy', 'Status pomiaru TPTR', 'Cisnienie na pozimie stacji','Status pomiaru PPPS ', 'Cisnienie na pozimie morza', 'Status pomiaru PPPM', 'Charakterystyka tendencji', 'Wartosc tendencji',
'Status pomiaru APP', 'Opad za 6 godzin', 'Status pomiaru WO6G', 'Rodzaj opadu za 6 godzin',
'Status pomiaru ROPT', 'Pogoda biezaca', 'Pogoda ubiegla', 'Zachmurzenie niskie', 'Status pomiaru CLCM',
'Chmury CL', 'Status pomiaru CHCL', 'Chmury CL tekstem', 'Chmury CM', 'Status pomiaru CHCM',
'Chmury CM tekstem', 'Chmury CH [kod]', 'Status pomiaru CHCH', 'Chmury CH tekstem', 'Stan gruntu',
'Status pomiaru SGRN', 'Niedosyt wilgotnosci', 'Status pomiaru DEFI', 'Usłonecznienie', 'Status pomiaru USLN',
'Wystapienie rosy', 'Status pomiaru ROSW', 'Poryw maksymalny za okres WW', 'Status pomiaru PORK',
'Godzina wystapienia porywu', 'Minuta wystapienia porywu', 'Temperatura gruntu -5', 'Status pomiaru TG05',
'Temperatura gruntu -10', 'Status pomiaru TG10','Temperatura gruntu -20', 'Status pomiaru TG20',
'Temperatura gruntu -50', 'Status pomiaru TG50', 'Temperatura gruntu -100', 'Status pomiaru TG100',
'Temperatura minimalna za 12 godzin', 'Status pomiaru TMIN ', 'Temperatura maksymalna za 12 godzin', 'Status pomiaru TMAX ',
'Temperatura minimalna przy gruncie za 12 godzin', 'Status pomiaru TGMI',
'Równoważnik wodny sniegu', 'Status pomiaru RWSN', 'Wysokosć pokrywy snieżnej', 'Status pomiaru PKSN',
'Wysokosć swieżo spadłego sniegu', 'Status pomiaru HSS', 'Wysokosć sniegu na poletku', 'Status pomiaru GRSN',
'Gatunek sniegu', 'Ukształtowanie pokrywy', 'Wysokosć próbki', 'Status pomiaru HPRO',
'Ciężar próbki', 'Status pomiaru CIPR')
colnames(x) <- colnames

This is all about temperature so let's select the right columns and translate them to English.

temp <- x %>% select('Rok', 'Miesiac', 'Dzien', 'Godzina', 'Temperatura powietrza')
colnames(temp) <- c("year", "month", "day", "hour", "air_temp")

Explore data

Missing values

naniar package is nice for missing data visualization.

naniar::vis_miss(temp)

And this is great as we have a complete data set! Let's explore it further.

Temperature values distribution

temp %>% 
  ggplot(aes(x=air_temp, fill = as.factor(year)))+
  geom_density(alpha = .3)+
  facet_wrap(~year)+
  labs(fill = "year")+
  theme_light()

That's quite interesting. We see a bimodal distribution across most of the years (summer and winter? what about spring and autumn? do we still have 4 seasons?). 2012 looks like it had the most moderate temperatures as it is flat at the top. Anyway, let's split it by seasons. Let's assume that winter is 21-Dec - 20 Mar, spring 21-Mar - 20 Jun, summer 21-Jun - 20-Sept, Autumn 21-Sept - 20-Dec

temp_season <- temp %>%
  mutate(
    month = as.integer(month),
    day = as.integer(day),
    season = case_when(
      month == 12 & day >= 21 ~ "winter",
      month %in% c(1,2) ~ "winter",
      month == 3 & day < 21~ "winter",
      month == 3 & day >= 21 ~ "spring",
      month %in% c(4,5) ~ "spring",
      month == 6 & day < 21~ "spring",
      month == 6 & day >= 21 ~ "summer",
      month %in% c(7,8) ~ "summer",
      month == 9 & day < 21~ "summer",
      month == 9 & day >= 21 ~ "autumn",
      month %in% c(10,11) ~ "autumn",
      month == 12 & day < 21~ "autumn",
      TRUE ~ NA_character_),
    season = factor(season, ordered = TRUE, levels = c("spring", "summer", "autumn", "winter"))
    )
temp_season_median <- temp_season %>% 
  group_by(season, year) %>% 
  summarise(median = median(air_temp))

temp_season %>% 
  ggplot(aes(x=air_temp, color = as.factor(year)))+
  geom_density()+
  facet_wrap(~season, ncol = 1)+
  labs(color = "year")+
  theme_light()

It looks like summer is the most stable season. There is a lot of variation in other seasons across the years. How it will look like when we split it by year:

temp_season %>%
  ggplot(aes(x=air_temp, fill = as.factor(season)))+
  geom_density(alpha = .3)+
  geom_vline(data = temp_season_median,
             aes(xintercept = median),
             color = 'red')+
  facet_grid(year~season, scales = "free_x")+
  labs(fill = "year")+
  theme_light()

Red vertical line indicates the median values per season of each year.

Trend analysis (seasons)

It's difficult to see the trend on the charts above. Let's plot median values for each season each year:

temp_season_median %>% 
  ggplot(aes(x=year, y = median, color = season))+
  geom_jitter(shape = 21)+
  geom_smooth(method = "lm")+
  labs(y = "median air_temp")+
  theme_light()

Yep, the median temperature of all the seasons generally increases in time. Here is the plot with all data points for reference and a boxplot:

p1 <- temp_season %>% 
  ggplot(aes(x=year, y = air_temp, fill = season))+
  geom_jitter(shape = 21, alpha = .1, color = "black")+
  geom_smooth(aes(color = season), method = "lm")+
  theme_light()+
  guides(fill = guide_legend(ncol = 2))

p2 <- temp_season %>% 
  ggplot(aes(x=as.factor(season), y = air_temp))+
  geom_boxplot(aes(fill = as.factor(year)), outlier.shape = 21, outlier.alpha = .1)+
  theme_light()+
  #facet_wrap(~season, ncol = 4)+
  labs(y = "air_temp", x = "season", fill = "year")+
  theme(legend.position = "right")+
  guides(fill = guide_legend(ncol = 2))

weather::multiplot(p1, p2, cols=1)

Compare trends (seasons)

The way to check which season temperature is increasing faster is to compare betas (slope angle) of linear model. Here are the betas for each season:

seasons <- sort(unique(temp_season$season))
trends <- c()
for(s in seq_along(seasons)){
  #print(seasons[s])
  x <- temp_season %>%
    filter(season == seasons[s])
  #print(head(x))
  mod_lm <- lm(air_temp ~ year, data = x)
  #print(summary(mod_lm))
  #print('###########################')

  slope <- mod_lm$coefficients[which(names(mod_lm$coefficients) == "year")]
  #print(slope)
  trends[s] <- slope
}
#trends
temp_season %>%
  select(season) %>%
  distinct() %>%
  arrange(season) %>% 
  mutate(trend = trends) %>% 
  arrange(desc(trends)) %>%
  mutate(season = factor(season, ordered = TRUE, levels = season)) %>%
  ggplot(aes(x = season, y = trend))+
  geom_col(color = "black")+
  geom_text(aes(y = trend + 0.01, label = round(trend, 3)))+
  labs(title = "Values of betas of fitted linear model per season")+
  theme_light()

Winter and spring are warming at the fastest pace. Around 0.16 and 0.15 degrees of Celcius per year. Autumn is the most stable but still warming.

By months

Let's check how it looks like when we look at the data splitted by months. For me it was quite surprising that during winter months sometimes overlap with temperatures in June, July or August. Maybe those low tempereatures in summere were temperatures during nights?

temp_season %>% 
  ggplot(aes(x=month, y=air_temp)) +
  geom_jitter(aes(fill=season), shape=21, color = "black", alpha=.6)+
  geom_smooth(color = "darkred")+
  theme_light()
temp_daytime <- temp_season %>% 
  mutate(
    day_time = case_when(
      hour %in% c(6:12) ~ "morning",
      hour %in% c(13:21) ~ "afternoon",
      TRUE ~ "night"),
    day_time = as.factor(day_time)
  ) 
temp_daytime %>% 
  filter(season %in% c("summer", "winter")) %>% 
  ggplot(aes(x=as.factor(month), y=air_temp)) +
  geom_jitter(aes(fill=day_time), shape=21, color = "black", alpha=.6)+
  geom_smooth(color = "darkred")+
  theme_light()
temp_daytime %>% 
  filter(season %in% c("summer", "winter")) %>% 
  ggplot(aes(x=as.factor(day_time), y=air_temp)) +
  geom_jitter(aes(fill=season), shape=21, color = "black", alpha=.6)+
  geom_smooth(color = "darkred")+
  geom_hline(yintercept = max(temp_daytime$air_temp[which(temp_daytime$season == "winter")]))+
  geom_hline(yintercept = min(temp_daytime$air_temp[which(temp_daytime$season == "summer")]))+
  theme_light()


michkam89/weather documentation built on Feb. 24, 2020, 1:59 a.m.