#' Quality check of temperature values from meteorological stations
#'
#' This function allows you to clean temperature timeseries in a data.frame
#' @param df data.frame, A data.frame with a column in the Date format, and two
#' columns for daily minimum and maximum temperature respectively
#' @param max.col string, name of the column with daily maximum temperature
#' @param min.col string, name of the column with daily minimum temperature
#' @param return string, one of 'clean' or 'raw'. 'Raw' will give the dataframe that is
#' used to check for errors while 'clean' will give the dataframe with wrong values removed and a complete
#' timeseries between minimum and maximum date.
#' @param verbose logical, if true the amount of rows that are dropped for a dataframe are printed out
#' @import data.table
#' @export
#' @examples
#' check_quality(df = df.daily)
check_quality <- function(df, max.col = 'T_max', min.col = 'T_min', return = 'clean', verbose = F) {
if (!inherits(df, 'data.table')) { df <- data.table::setDT(df) }
datecol.name <- rebecka::detect_datecol(df, return = 'name', format = 'Date')
data.table::setnames(df, old = c(max.col, min.col, datecol.name), new = c('T_max_temp', 'T_min_temp', 'Datetime_temp'))
data.table::setorder(df, Datetime_temp)
#Create a column with the tmax and tmin values shifted one day to the future
df[,T_min_lag := data.table::shift(T_min_temp, 1, type = 'lag')]
df[,T_max_lag := data.table::shift(T_max_temp, 1, type = 'lag')]
#Check if extreme values are within plausible range
df[T_max_temp > 43 | T_min_temp > 43, flag_extreme := 1]
df[T_max_temp < -43 | T_min_temp < -43, flag_extreme := 1] #from Klimareport, 2018
#Check relationship between max min and mean temperature
df[T_max_temp < T_min_temp, flag_consist := 1]
#Number of consecutive days with the same value
#####NAs are counted as consecutive occurences :(
df[,count_max := .N, by = data.table::rleid(T_max_temp)]
df[count_max > 7, flag_cons_max := 1]
df[,count_min := .N, by = data.table::rleid(T_min_temp)]
df[count_min > 7, flag_cons_min := 1]
#Comparison with temperatures from previous day
df[T_max_temp < T_min_lag, flag_min_lag := 1]
df[T_max_lag < T_min_temp, flag_max_lag := 1] #from estevez, 2011
#Outliers (simple test if only few years and elaborate test if many years are present)
#Spatial test
#Identify rows with detected anomalies
df[,drop_bool := rowSums(.SD, na.rm = T), .SDcols = names(df) %like% 'flag']
#Drop unneded columns
df[,c("count_max", "count_min", "T_min_lag", "T_max_lag") := NULL]
#Drop rows with detected anomalies
df.clean <- df[drop_bool == 0, .SD, .SDcols = !names(df) %like% 'flag|bool']
rows.dropped <- nrow(tidyr::drop_na(df, T_max_temp, T_min_temp)) - nrow(df.clean)
rows.dropped <- ifelse(rows.dropped < 0, 0, rows.dropped)
if (verbose) print(paste0('Amount of rows dropped: ', rows.dropped))
df.clean <- rebecka::complete_dateseq(df.clean, datecol.name = 'Datetime_temp')
data.table::setnames(df.clean, old = c('T_max_temp', 'T_min_temp', 'Datetime_temp'), new = c(max.col, min.col, datecol.name))
if (return == 'raw') return(df)
if (return == 'clean') return(df.clean)
}
#' Quality check of temperature values from meteorological stations. Work in progress
#'
#' This function allows you to clean temperature timeseries in a data.frame
#' @param df data.frame, A data.frame with a column in the Date format, and two
#' columns for daily minimum and maximum temperature respectively
#' @param value.cols string, name of the column that should be checked for unrealistic values
#' @param f numeric, with of the standard deviation window that is used to discard values
#' @param f.pers numeric, work in progress
#' @param roll.window numeric, width of the window to calculate rolling standard deviation
#' @param return string, one of 'raw' or 'clean', determines the type of output of the function
#' @param timestep integer, number of seconds between to observations
#' @import data.table
#' @export
#' @examples
#' check_quality_paper(df = df.daily)
check_quality_sigma <- function(df, value.cols, f = 4, f.pers = 100, roll.window = 5, return = 'clean',
timestep = NULL) {
stopifnot(!missing(value.cols), all(value.cols %in% colnames(df)))
datecol <- rebecka::detect_datecol(df, format = 'Date', return = 'name')
stopifnot(length(datecol) == 1)
df <- rebecka::complete_dateseq(df, datecol.name = datecol, timestep = timestep)
datecol.name <- as.name(datecol)
result <- df %>% dplyr::arrange(!!datecol.name)
for (value.col in value.cols) {
col.name <- stringr::str_c(value.col, '_drop_bool')
value.col <- as.name(value.col)
df.base <- df %>%
dplyr::arrange(!!datecol.name) %>%
dplyr::mutate(year = lubridate::year(!!datecol.name),
month = lubridate::month(!!datecol.name),
step = !!value.col - dplyr::lag(!!value.col, 1),
roll_sd = zoo::rollapply(!!value.col, roll.window, sd, fill = NA))
extreme.test <- df.base %>%
dplyr::group_by(month) %>%
dplyr::mutate(mean = mean(!!value.col, na.rm = T),
stabw = sd(!!value.col, na.rm = T),
thresh_down = floor(mean - (f * stabw)),
thresh_up = ceiling(mean + (f * stabw)),
bool.extreme = ifelse(!!value.col >= thresh_down & !!value.col <= thresh_up, FALSE, TRUE)) %>%
dplyr::ungroup()
step.test <- df.base %>%
dplyr::group_by(month) %>%
dplyr::mutate(mean = mean(step, na.rm = T),
stabw = sd(step, na.rm = T),
thresh_down = mean - (f * stabw),
thresh_up = mean + (f * stabw),
bool.step = ifelse(step >= thresh_down & step <= thresh_up, FALSE, TRUE)) %>%
ungroup()
persistence.test <- df.base %>%
dplyr::group_by(year, month) %>%
dplyr::mutate(stabw_month = sd(!!value.col, na.rm = T)) %>%
dplyr::ungroup() %>%
dplyr::group_by(month) %>%
dplyr::mutate(stabw_mean = mean(stabw_month, na.rm = T),
stabw_stabw = sd(stabw_month, na.rm = T),
thresh_down = 0,
thresh_up = stabw_mean + (f.pers * stabw_stabw),
bool.pers = ifelse(roll_sd >= thresh_down & roll_sd <= thresh_up, FALSE, TRUE)) %>%
dplyr::ungroup()
df.bool <- df.base %>%
dplyr::left_join(dplyr::select(extreme.test, !!datecol.name, bool.extreme), by = datecol) %>%
dplyr::left_join(dplyr::select(step.test, !!datecol.name, bool.step), by = datecol) %>%
dplyr::left_join(dplyr::select(persistence.test, !!datecol.name, bool.pers), by = datecol) %>%
dplyr::mutate(drop_bool = rowSums(dplyr::select(., dplyr::contains('bool'))))
result <- result %>%
dplyr::left_join(dplyr::select(df.bool, !!datecol.name, drop_bool), by = datecol) %>%
dplyr::rename(!!col.name := drop_bool)
}
if (return == 'raw') result <- result
if (return == 'clean') {
result <- result %>%
dplyr::filter_at(vars(contains('drop_bool')), all_vars(. == 0)) %>%
dplyr::select(-contains('drop_bool'))
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.