knitr::opts_chunk$set(echo = TRUE)
This is the tutorial for implementing quality control procedure using the R package "QCwind". The quality control procedure is comprehensively introduced and discussed in the master thesis (\url{https://dspace.library.uu.nl/handle/1874/398742}), while this "QCwind" package is build to apply the QC checks.
In this tutorial, we focus on the wind speed observations from citizen weather stations (CWS) located in the province of Utrecht of the Netherlands. The data is provided by Weather Observation Website (WOW-NL, \url{https://wow.knmi.nl/}).
# library(QCwind) ### data - example # x <- sample(1000) # usethis::use_data(x, mtcars) # library(mtcars) # Documenting datasets
The WOW data studied here includes wind observations of CWS (WOW stations) located in the province of Utrecht during the three years 2016-2018;
We do not consider WOW stations that do not have enough valid wind speed observations in the three years;
If a WOW station only have valid (not NA) wind speed data in less than one-year period (less than 35% data completeness), we exclude this station in further study.
# data: wow_raw_noNA_data.RData - used_station_data_each, wow_station_information_used valid_percent = function(dataframe) { # calculate the valid data percentage of each station, given that NA wind speed observations are already removed with the whole lines. require(lubridate) valid.days = length( unique( lubridate::date(dataframe$datetime) ) ) whole.3y.days = 366 + 365 + 365 valid.percent = valid.days / whole.3y.days * 100 return(valid.percent) } station.valid_percent = rep(NA,67) for (w in 1:length(used_station_data_each)) { station.valid_percent[w] = valid_percent(used_station_data_each[[w]]) } station.enough_data.label = setdiff( which(station.valid_percent >= 35), c(24,25) ) # remove 24,25 (knmi stations debilt and cabauw).
Note that we remove #24 and #25 station from the label, as they are KNMI stations "De Bilt" and "Cabauw".
Several CWS reports repeated wind speed values at all times, we need to identify and remove such stations;
If most (more than 90%) of the wind speed observations are of the same value, we exclude that station in further study.
# data: output of above repeat_percent = function(dataframe) { all.values = table(dataframe$windspeed_metrepersecond) repeat.percent = max(all.values) / length(dataframe$windspeed_metrepersecond) * 100 return(repeat.percent) } station.repeat_percent = rep(NA,67) for (w in 1:length(used_station_data_each)) { station.repeat_percent[w] = repeat_percent(used_station_data_each[[w]]) } station.enough_data.label = intersect( which(station.repeat_percent < 90), station.enough_data.label )
Our results show that WOW stations #29 and #39 are removed from the previous station list station.enough_data.label
.
We select a list of 39 WOW stations from raw data to be considered in the quality control study.
# data: output of above final_label = station.enough_data.label wow_information = wow_station_information_used[final_label,] wow_wind_data = used_station_data_each[final_label] # only keep wind speed data library(tidyverse) wow_windspeed_each = c() for (w in 1:length(wow_wind_data)) { wow_windspeed_each[[w]] = wow_wind_data[[w]] %>% dplyr::select(station_id = station_id, datetime = datetime, windspeed = windspeed_metrepersecond) } # save new data in wow.after_preprocess.RData
Check the historical KNMI wind speed data to determine an upper bound for WOW wind speeds;
Use the upper bound to check the plausible range of WOW wind speed observations.
# data: wow.after_preprocess.RData (KNMI reference wind speed data added) library(QCwind) library(xts) library(zoo) library(lubridate) library(tidyverse) w = 11 wow_windspeed.uniform_standardQC = c() for (w in 1:39) { wow_single = wow_windspeed_each[[w]] # 1. range check wow_single_range = range_check(wow_single, data.column = 'windspeed', datetime.column = 'datetime', upper.bound = c(33.4,29.8,29.8,29.8,26.2,23.1,25.0,26.8,35.0,31.0,30.9,32.4)) attributes(wow_single_range) # 2. step check Sys.time() # 2min per station wow_single_step = temporal_step_check.improve(wow_single_range, data.column = 'new_data_range', datetime.column = 'datetime', step.duration = 660, max.variation = 13.88) Sys.time() attributes(wow_single_step) # uniform time series datetime_sequence = seq.POSIXt(lubridate::ymd_hms('2016-01-01 00:00:00 UTC'), lubridate::ymd_hms('2019-01-01 00:00:00 UTC'), units = "seconds", by = 600) wow_single_step.uniform.xts = uniform_data(data = wow_single_step, data.column = 'new_data_step', datetime.column = 'datetime', timeseq = datetime_sequence) wow_single_step.uniform = tibble(station_id = wow_single_step$station_id[1], datetime = datetime_sequence, windspeed_uniform = as.numeric(coredata(wow_single_step.uniform.xts)) ) # 3. persist check Sys.time() wow_single_persist.normal = temporal_persist_check(wow_single_step.uniform, data.column = 'windspeed_uniform', datetime.column = 'datetime', persist.duration = 16*600, min.variation = 0.05) Sys.time() wow_single_persist.extend = temporal_persist_check(wow_single_step.uniform, data.column = 'windspeed_uniform', datetime.column = 'datetime', persist.duration = 864*600, min.variation = 0.05) Sys.time() wow_single_persist = wow_single_persist.normal %>% mutate(flag_persist_longzeros = wow_single_persist.extend$flag_persist) %>% mutate(new_data_persist_longzeros = wow_single_persist.extend$new_data_persist) wow_windspeed.uniform_standardQC[[w]] = wow_single_persist } save.image("~/Documents/wow.after_standardQC.RData") Sys.time()
############### check KNMI persistence knmi_ws_var = apply(knmi_windspeed, 2, function(x){ x - dplyr::lag(x) }) range(abs(knmi_ws_var), na.rm = TRUE) for (k in 1:47) { # print(k) ws_label = which( abs(knmi_ws_var[,k]) > 0.01) # 0.01, 0.1, 0.05 knmi_ws_var[ws_label,k] = NA # print(length(which(knmi_ws_var[,k] == 0))) } split_NA <- function( x ){ idx <- 1 + cumsum( is.na( x ) ) not.na <- ! is.na( x ) split( x[not.na], idx[not.na] ) } split_knmi_ws = apply(knmi_ws_var, 2, split_NA) test_split_knmi_official = split_knmi_ws test_split_length_knmi_official = c() for (k in 1:47) { test = test_split_knmi_official[[k]] test_l = 1:length(test) for (n in 1 : length(test) ) { test_l[n] = length(test[[n]]) } test_split_length_knmi_official[[k]] = test_l } interval_lengths = unlist(test_split_length_knmi_official) # test_split_length_knmi_official[[k]] table(interval_lengths) # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 18 19 22 # 138311 3769 241 84 41 29 12 10 9 8 4 5 5 2 1 2 1 1 1 # 24 25 26 27 32 35 37 42 43 55 58 85 119 135 136 591 830 # 1 1 1 1 1 7 1 1 1 1 1 1 1 1 1 1 1 # choose 16*10min as the persist duration ###############################
knmi_kriging_all = kriging_quantile_true(official.windspeed = knmi_windspeed, datetime.sequence = datetime_sequence, official.longitude = knmi_information$longitude, official.latitude = knmi_information$latitude, split.season = TRUE) wow_kriging_quantiles = interpolate_quantiles(kriging.reference.quantiles = knmi_kriging_all, station.id = wow_information$station_id, station.longitude = wow_information$longitude, station.latitude = wow_information$latitude) wow_windspeed.afterBC = list() for (w in 1:39) { print(Sys.time()) wow_single_beforeBC = wow_windspeed.uniform_standardQC[[w]] wow_single_afterBC = wow_single_beforeBC %>% mutate(new_data_bc = NA) wow_single_afterBC$new_data_bc = eqm_bias_correction( train.obs = wow_single_beforeBC$windspeed_before_bc, train.datetime = wow_single_beforeBC$datetime, test.obs = wow_single_beforeBC$windspeed_before_bc, test.datetime = wow_single_beforeBC$datetime, true.quantiles = wow_kriging_quantiles[[w]]) wow_windspeed.afterBC[[w]] = wow_single_afterBC }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.