knitr::opts_chunk$set(echo = TRUE)

This is the tutorial for implementing quality control procedure using the R package "QCwind". The quality control procedure is comprehensively introduced and discussed in the master thesis (\url{https://dspace.library.uu.nl/handle/1874/398742}), while this "QCwind" package is build to apply the QC checks.

In this tutorial, we focus on the wind speed observations from citizen weather stations (CWS) located in the province of Utrecht of the Netherlands. The data is provided by Weather Observation Website (WOW-NL, \url{https://wow.knmi.nl/}).

# library(QCwind)
### data - example
# x <- sample(1000)
# usethis::use_data(x, mtcars)
# library(mtcars)
# Documenting datasets

Pre-processing

Data completeness check of each WOW station

# data: wow_raw_noNA_data.RData - used_station_data_each, wow_station_information_used

valid_percent = function(dataframe) {
  # calculate the valid data percentage of each station, given that NA wind speed observations are already removed with the whole lines.
  require(lubridate)
  valid.days = length( unique( lubridate::date(dataframe$datetime) ) )
  whole.3y.days = 366 + 365 + 365
  valid.percent = valid.days / whole.3y.days * 100
  return(valid.percent)
}

station.valid_percent = rep(NA,67)
for (w in 1:length(used_station_data_each)) {
  station.valid_percent[w] = valid_percent(used_station_data_each[[w]])
}
station.enough_data.label = setdiff( which(station.valid_percent >= 35), c(24,25) ) # remove 24,25 (knmi stations debilt and cabauw).

Note that we remove #24 and #25 station from the label, as they are KNMI stations "De Bilt" and "Cabauw".

Data duplicate check

# data: output of above

repeat_percent = function(dataframe) {
  all.values = table(dataframe$windspeed_metrepersecond)
  repeat.percent = max(all.values) / length(dataframe$windspeed_metrepersecond) * 100
  return(repeat.percent)
}

station.repeat_percent = rep(NA,67)
for (w in 1:length(used_station_data_each)) {
  station.repeat_percent[w] = repeat_percent(used_station_data_each[[w]])
}
station.enough_data.label = intersect( which(station.repeat_percent < 90), station.enough_data.label )

Our results show that WOW stations #29 and #39 are removed from the previous station list station.enough_data.label.

Finishing pre-processing

We select a list of 39 WOW stations from raw data to be considered in the quality control study.

# data: output of above
final_label = station.enough_data.label
wow_information = wow_station_information_used[final_label,]
wow_wind_data = used_station_data_each[final_label]

# only keep wind speed data
library(tidyverse)
wow_windspeed_each = c()
for (w in 1:length(wow_wind_data)) {
  wow_windspeed_each[[w]] = wow_wind_data[[w]] %>% 
    dplyr::select(station_id = station_id, datetime = datetime, windspeed = windspeed_metrepersecond)
}
# save new data in wow.after_preprocess.RData

Standard quality control

Range check

  1. Check the historical KNMI wind speed data to determine an upper bound for WOW wind speeds;

  2. Use the upper bound to check the plausible range of WOW wind speed observations.

# data: wow.after_preprocess.RData (KNMI reference wind speed data added)
library(QCwind)
library(xts)
library(zoo)
library(lubridate)
library(tidyverse)

w = 11

wow_windspeed.uniform_standardQC = c()

for (w in 1:39) {

wow_single = wow_windspeed_each[[w]]

# 1. range check
wow_single_range = range_check(wow_single, 
                               data.column = 'windspeed', 
                               datetime.column = 'datetime', 
                               upper.bound = 
                                 c(33.4,29.8,29.8,29.8,26.2,23.1,25.0,26.8,35.0,31.0,30.9,32.4))
attributes(wow_single_range)


# 2. step check
Sys.time() # 2min per station
wow_single_step = temporal_step_check.improve(wow_single_range, 
                                              data.column = 'new_data_range', 
                                              datetime.column = 'datetime',
                                              step.duration = 660, 
                                              max.variation = 13.88)
Sys.time()
attributes(wow_single_step)


# uniform time series
datetime_sequence = seq.POSIXt(lubridate::ymd_hms('2016-01-01 00:00:00 UTC'),
                               lubridate::ymd_hms('2019-01-01 00:00:00 UTC'), 
                               units = "seconds", by = 600)
wow_single_step.uniform.xts = uniform_data(data = wow_single_step, 
                                           data.column = 'new_data_step', 
                                           datetime.column = 'datetime',
                                           timeseq = datetime_sequence)
wow_single_step.uniform = tibble(station_id = wow_single_step$station_id[1], 
                                 datetime = datetime_sequence,
                                 windspeed_uniform = as.numeric(coredata(wow_single_step.uniform.xts)) )                                   

# 3. persist check
Sys.time() 
wow_single_persist.normal = temporal_persist_check(wow_single_step.uniform, 
                                                   data.column = 'windspeed_uniform', 
                                                   datetime.column = 'datetime',
                                                   persist.duration = 16*600, min.variation = 0.05)
Sys.time()
wow_single_persist.extend = temporal_persist_check(wow_single_step.uniform, 
                                                   data.column = 'windspeed_uniform', 
                                                   datetime.column = 'datetime',
                                                   persist.duration = 864*600, min.variation = 0.05)
Sys.time()

wow_single_persist = wow_single_persist.normal %>%
  mutate(flag_persist_longzeros = wow_single_persist.extend$flag_persist) %>%
  mutate(new_data_persist_longzeros = wow_single_persist.extend$new_data_persist)
wow_windspeed.uniform_standardQC[[w]] = wow_single_persist

}

save.image("~/Documents/wow.after_standardQC.RData")
Sys.time()
############### check KNMI persistence
knmi_ws_var = apply(knmi_windspeed, 2, function(x){
  x - dplyr::lag(x)
})
range(abs(knmi_ws_var), na.rm = TRUE)
for (k in 1:47) {
  # print(k)
  ws_label = which( abs(knmi_ws_var[,k]) > 0.01) # 0.01, 0.1, 0.05
  knmi_ws_var[ws_label,k] = NA
  # print(length(which(knmi_ws_var[,k] == 0)))
}
split_NA <- function( x ){
  idx <- 1 + cumsum( is.na( x ) )
  not.na <- ! is.na( x )
  split( x[not.na], idx[not.na] )
}
split_knmi_ws = apply(knmi_ws_var, 2, split_NA)

test_split_knmi_official = split_knmi_ws
test_split_length_knmi_official = c()
for (k in 1:47) {
  test = test_split_knmi_official[[k]]
  test_l = 1:length(test)
  for (n in 1 : length(test) ) {
    test_l[n] = length(test[[n]])
  }
  test_split_length_knmi_official[[k]] = test_l
}
interval_lengths = unlist(test_split_length_knmi_official) # test_split_length_knmi_official[[k]]
table(interval_lengths)
#      1      2      3      4      5      6      7      8      9     10     11     12     13     14     15     16     18     19     22 
# 138311   3769    241     84     41     29     12     10      9      8      4      5      5      2      1      2      1      1      1 
# 24     25     26     27     32     35     37     42     43     55     58     85    119    135    136    591    830 
#  1      1      1      1      1      7      1      1      1      1      1      1      1      1      1      1      1 
# choose 16*10min as the persist duration
###############################

Bias correction

knmi_kriging_all = kriging_quantile_true(official.windspeed = knmi_windspeed,
                                         datetime.sequence = datetime_sequence,
                                         official.longitude = knmi_information$longitude,
                                         official.latitude = knmi_information$latitude, 
                                         split.season = TRUE)

wow_kriging_quantiles = interpolate_quantiles(kriging.reference.quantiles = knmi_kriging_all,
                                              station.id = wow_information$station_id,
                                              station.longitude = wow_information$longitude,
                                              station.latitude = wow_information$latitude)

wow_windspeed.afterBC = list()
for (w in 1:39) {
  print(Sys.time())
  wow_single_beforeBC = wow_windspeed.uniform_standardQC[[w]]
  wow_single_afterBC = wow_single_beforeBC %>%
    mutate(new_data_bc = NA)
  wow_single_afterBC$new_data_bc = eqm_bias_correction(
    train.obs = wow_single_beforeBC$windspeed_before_bc,
    train.datetime = wow_single_beforeBC$datetime,
    test.obs = wow_single_beforeBC$windspeed_before_bc,
    test.datetime = wow_single_beforeBC$datetime,
    true.quantiles = wow_kriging_quantiles[[w]])
  wow_windspeed.afterBC[[w]] = wow_single_afterBC
}


jieyu97/QCwind documentation built on June 18, 2021, 3:37 a.m.