In jieyu97/QCwind: Quality control procedures for citizen science wind observations

knitr::opts_chunk$set(echo = TRUE)

This is the tutorial for implementing quality control procedure using the R package "QCwind". The quality control procedure is comprehensively introduced and discussed in the master thesis (\url{https://dspace.library.uu.nl/handle/1874/398742}), while this "QCwind" package is build to apply the QC checks.

In this tutorial, we focus on the wind speed observations from citizen weather stations (CWS) located in the province of Utrecht of the Netherlands. The data is provided by Weather Observation Website (WOW-NL, \url{https://wow.knmi.nl/}).

# library(QCwind)
### data - example
# x <- sample(1000)
# usethis::use_data(x, mtcars)
# library(mtcars)
# Documenting datasets

Pre-processing

Data completeness check of each WOW station

The WOW data studied here includes wind observations of CWS (WOW stations) located in the province of Utrecht during the three years 2016-2018;
We do not consider WOW stations that do not have enough valid wind speed observations in the three years;
If a WOW station only have valid (not NA) wind speed data in less than one-year period (less than 35% data completeness), we exclude this station in further study.

# data: wow_raw_noNA_data.RData - used_station_data_each, wow_station_information_used

valid_percent = function(dataframe) {
  # calculate the valid data percentage of each station, given that NA wind speed observations are already removed with the whole lines.
  require(lubridate)
  valid.days = length( unique( lubridate::date(dataframe$datetime) ) )
  whole.3y.days = 366 + 365 + 365
  valid.percent = valid.days / whole.3y.days * 100
  return(valid.percent)
}

station.valid_percent = rep(NA,67)
for (w in 1:length(used_station_data_each)) {
  station.valid_percent[w] = valid_percent(used_station_data_each[[w]])
}
station.enough_data.label = setdiff( which(station.valid_percent >= 35), c(24,25) ) # remove 24,25 (knmi stations debilt and cabauw).

Note that we remove #24 and #25 station from the label, as they are KNMI stations "De Bilt" and "Cabauw".

Data duplicate check

Several CWS reports repeated wind speed values at all times, we need to identify and remove such stations;
If most (more than 90%) of the wind speed observations are of the same value, we exclude that station in further study.

# data: output of above

repeat_percent = function(dataframe) {
  all.values = table(dataframe$windspeed_metrepersecond)
  repeat.percent = max(all.values) / length(dataframe$windspeed_metrepersecond) * 100
  return(repeat.percent)
}

station.repeat_percent = rep(NA,67)
for (w in 1:length(used_station_data_each)) {
  station.repeat_percent[w] = repeat_percent(used_station_data_each[[w]])
}
station.enough_data.label = intersect( which(station.repeat_percent < 90), station.enough_data.label )

Our results show that WOW stations #29 and #39 are removed from the previous station list station.enough_data.label.

Finishing pre-processing

We select a list of 39 WOW stations from raw data to be considered in the quality control study.

# data: output of above
final_label = station.enough_data.label
wow_information = wow_station_information_used[final_label,]
wow_wind_data = used_station_data_each[final_label]

# only keep wind speed data
library(tidyverse)
wow_windspeed_each = c()
for (w in 1:length(wow_wind_data)) {
  wow_windspeed_each[[w]] = wow_wind_data[[w]] %>% 
    dplyr::select(station_id = station_id, datetime = datetime, windspeed = windspeed_metrepersecond)
}
# save new data in wow.after_preprocess.RData

Standard quality control

Range check

Check the historical KNMI wind speed data to determine an upper bound for WOW wind speeds;
Use the upper bound to check the plausible range of WOW wind speed observations.

# data: wow.after_preprocess.RData (KNMI reference wind speed data added)
library(QCwind)
library(xts)
library(zoo)
library(lubridate)
library(tidyverse)

w = 11

wow_windspeed.uniform_standardQC = c()

for (w in 1:39) {

wow_single = wow_windspeed_each[[w]]

# 1. range check
wow_single_range = range_check(wow_single, 
                               data.column = 'windspeed', 
                               datetime.column = 'datetime', 
                               upper.bound = 
                                 c(33.4,29.8,29.8,29.8,26.2,23.1,25.0,26.8,35.0,31.0,30.9,32.4))
attributes(wow_single_range)


# 2. step check
Sys.time() # 2min per station
wow_single_step = temporal_step_check.improve(wow_single_range, 
                                              data.column = 'new_data_range', 
                                              datetime.column = 'datetime',
                                              step.duration = 660, 
                                              max.variation = 13.88)
Sys.time()
attributes(wow_single_step)


# uniform time series
datetime_sequence = seq.POSIXt(lubridate::ymd_hms('2016-01-01 00:00:00 UTC'),
                               lubridate::ymd_hms('2019-01-01 00:00:00 UTC'), 
                               units = "seconds", by = 600)
wow_single_step.uniform.xts = uniform_data(data = wow_single_step, 
                                           data.column = 'new_data_step', 
                                           datetime.column = 'datetime',
                                           timeseq = datetime_sequence)
wow_single_step.uniform = tibble(station_id = wow_single_step$station_id[1], 
                                 datetime = datetime_sequence,
                                 windspeed_uniform = as.numeric(coredata(wow_single_step.uniform.xts)) )                                   

# 3. persist check
Sys.time() 
wow_single_persist.normal = temporal_persist_check(wow_single_step.uniform, 
                                                   data.column = 'windspeed_uniform', 
                                                   datetime.column = 'datetime',
                                                   persist.duration = 16*600, min.variation = 0.05)
Sys.time()
wow_single_persist.extend = temporal_persist_check(wow_single_step.uniform, 
                                                   data.column = 'windspeed_uniform', 
                                                   datetime.column = 'datetime',
                                                   persist.duration = 864*600, min.variation = 0.05)
Sys.time()

wow_single_persist = wow_single_persist.normal %>%
  mutate(flag_persist_longzeros = wow_single_persist.extend$flag_persist) %>%
  mutate(new_data_persist_longzeros = wow_single_persist.extend$new_data_persist)
wow_windspeed.uniform_standardQC[[w]] = wow_single_persist

}

save.image("~/Documents/wow.after_standardQC.RData")
Sys.time()

############### check KNMI persistence
knmi_ws_var = apply(knmi_windspeed, 2, function(x){
  x - dplyr::lag(x)
})
range(abs(knmi_ws_var), na.rm = TRUE)
for (k in 1:47) {
  # print(k)
  ws_label = which( abs(knmi_ws_var[,k]) > 0.01) # 0.01, 0.1, 0.05
  knmi_ws_var[ws_label,k] = NA
  # print(length(which(knmi_ws_var[,k] == 0)))
}
split_NA <- function( x ){
  idx <- 1 + cumsum( is.na( x ) )
  not.na <- ! is.na( x )
  split( x[not.na], idx[not.na] )
}
split_knmi_ws = apply(knmi_ws_var, 2, split_NA)

test_split_knmi_official = split_knmi_ws
test_split_length_knmi_official = c()
for (k in 1:47) {
  test = test_split_knmi_official[[k]]
  test_l = 1:length(test)
  for (n in 1 : length(test) ) {
    test_l[n] = length(test[[n]])
  }
  test_split_length_knmi_official[[k]] = test_l
}
interval_lengths = unlist(test_split_length_knmi_official) # test_split_length_knmi_official[[k]]
table(interval_lengths)
#      1      2      3      4      5      6      7      8      9     10     11     12     13     14     15     16     18     19     22 
# 138311   3769    241     84     41     29     12     10      9      8      4      5      5      2      1      2      1      1      1 
# 24     25     26     27     32     35     37     42     43     55     58     85    119    135    136    591    830 
#  1      1      1      1      1      7      1      1      1      1      1      1      1      1      1      1      1 
# choose 16*10min as the persist duration
###############################

Bias correction

knmi_kriging_all = kriging_quantile_true(official.windspeed = knmi_windspeed,
                                         datetime.sequence = datetime_sequence,
                                         official.longitude = knmi_information$longitude,
                                         official.latitude = knmi_information$latitude, 
                                         split.season = TRUE)

wow_kriging_quantiles = interpolate_quantiles(kriging.reference.quantiles = knmi_kriging_all,
                                              station.id = wow_information$station_id,
                                              station.longitude = wow_information$longitude,
                                              station.latitude = wow_information$latitude)

wow_windspeed.afterBC = list()
for (w in 1:39) {
  print(Sys.time())
  wow_single_beforeBC = wow_windspeed.uniform_standardQC[[w]]
  wow_single_afterBC = wow_single_beforeBC %>%
    mutate(new_data_bc = NA)
  wow_single_afterBC$new_data_bc = eqm_bias_correction(
    train.obs = wow_single_beforeBC$windspeed_before_bc,
    train.datetime = wow_single_beforeBC$datetime,
    test.obs = wow_single_beforeBC$windspeed_before_bc,
    test.datetime = wow_single_beforeBC$datetime,
    true.quantiles = wow_kriging_quantiles[[w]])
  wow_windspeed.afterBC[[w]] = wow_single_afterBC
}