rebecka_package: Utility functions used in the REBECKA project

Documented in check_quality check_quality_sigma

#' Quality check of temperature values from meteorological stations
#'
#' This function allows you to clean temperature timeseries in a data.frame
#' @param df data.frame, A data.frame with a column in the Date format, and two
#' columns for daily minimum and maximum temperature respectively
#' @param max.col string, name of the column with daily maximum temperature
#' @param min.col string, name of the column with daily minimum temperature
#' @param return string, one of 'clean' or 'raw'. 'Raw' will give the dataframe that is
#' used to check for errors while 'clean' will give the dataframe with wrong values removed and a complete
#' timeseries between minimum and maximum date.
#' @param verbose logical, if true the amount of rows that are dropped for a dataframe are printed out
#' @import data.table
#' @export
#' @examples
#' check_quality(df = df.daily)

check_quality <- function(df, max.col = 'T_max', min.col = 'T_min', return = 'clean', verbose = F) {

  if (!inherits(df, 'data.table')) { df <- data.table::setDT(df) }

  datecol.name <- rebecka::detect_datecol(df, return = 'name', format = 'Date')

  data.table::setnames(df, old = c(max.col, min.col, datecol.name), new = c('T_max_temp', 'T_min_temp', 'Datetime_temp'))

  data.table::setorder(df, Datetime_temp)

  #Create a column with the tmax and tmin values shifted one day to the future
  df[,T_min_lag := data.table::shift(T_min_temp, 1, type = 'lag')]
  df[,T_max_lag := data.table::shift(T_max_temp, 1, type = 'lag')]

  #Check if extreme values are within plausible range
  df[T_max_temp > 43 | T_min_temp > 43, flag_extreme := 1]
  df[T_max_temp < -43 | T_min_temp < -43, flag_extreme := 1]  #from Klimareport, 2018

  #Check relationship between max min and mean temperature
  df[T_max_temp < T_min_temp, flag_consist := 1]

  #Number of consecutive days with the same value
  #####NAs are counted as consecutive occurences :(
  df[,count_max := .N, by = data.table::rleid(T_max_temp)]
  df[count_max > 7, flag_cons_max := 1]

  df[,count_min := .N, by = data.table::rleid(T_min_temp)]
  df[count_min > 7, flag_cons_min := 1]

  #Comparison with temperatures from previous day
  df[T_max_temp < T_min_lag, flag_min_lag := 1]
  df[T_max_lag < T_min_temp, flag_max_lag := 1]   #from estevez, 2011

  #Outliers (simple test if only few years and elaborate test if many years are present)

  #Spatial test

  #Identify rows with detected anomalies
  df[,drop_bool := rowSums(.SD, na.rm = T), .SDcols = names(df) %like% 'flag']

  #Drop unneded columns
  df[,c("count_max", "count_min", "T_min_lag", "T_max_lag") := NULL]

  #Drop rows with detected anomalies
  df.clean <- df[drop_bool == 0, .SD, .SDcols = !names(df) %like% 'flag|bool']

  rows.dropped <- nrow(tidyr::drop_na(df, T_max_temp, T_min_temp)) - nrow(df.clean)
  rows.dropped <- ifelse(rows.dropped < 0, 0, rows.dropped)
  if (verbose) print(paste0('Amount of rows dropped: ', rows.dropped))

  df.clean <- rebecka::complete_dateseq(df.clean, datecol.name = 'Datetime_temp')

  data.table::setnames(df.clean, old = c('T_max_temp', 'T_min_temp', 'Datetime_temp'), new = c(max.col, min.col, datecol.name))

  if (return == 'raw') return(df)
  if (return == 'clean') return(df.clean)
}

#' Quality check of temperature values from meteorological stations. Work in progress
#'
#' This function allows you to clean temperature timeseries in a data.frame
#' @param df data.frame, A data.frame with a column in the Date format, and two
#' columns for daily minimum and maximum temperature respectively
#' @param value.cols string, name of the column that should be checked for unrealistic values
#' @param f numeric, with of the standard deviation window that is used to discard values
#' @param f.pers numeric, work in progress
#' @param roll.window numeric, width of the window to calculate rolling standard deviation
#' @param return string, one of 'raw' or 'clean', determines the type of output of the function
#' @param timestep integer, number of seconds between to observations
#' @import data.table
#' @export
#' @examples
#' check_quality_paper(df = df.daily)

check_quality_sigma <- function(df, value.cols, f = 4, f.pers = 100, roll.window = 5, return = 'clean',
                                timestep = NULL) {

  stopifnot(!missing(value.cols), all(value.cols %in% colnames(df)))

  datecol <- rebecka::detect_datecol(df, format = 'Date', return = 'name')
  stopifnot(length(datecol) == 1)

  df <- rebecka::complete_dateseq(df, datecol.name = datecol, timestep = timestep)

  datecol.name <- as.name(datecol)

  result <- df %>% dplyr::arrange(!!datecol.name)

  for (value.col in value.cols) {

    col.name <- stringr::str_c(value.col, '_drop_bool')
    value.col <- as.name(value.col)

    df.base <- df %>%

      dplyr::arrange(!!datecol.name) %>%

      dplyr::mutate(year = lubridate::year(!!datecol.name),
                    month = lubridate::month(!!datecol.name),
                    step = !!value.col - dplyr::lag(!!value.col, 1),
                    roll_sd = zoo::rollapply(!!value.col, roll.window, sd, fill = NA))

    extreme.test <- df.base %>%
      dplyr::group_by(month) %>%
      dplyr::mutate(mean = mean(!!value.col, na.rm = T),
                    stabw = sd(!!value.col, na.rm = T),
                    thresh_down = floor(mean - (f * stabw)),
                    thresh_up = ceiling(mean + (f * stabw)),
                    bool.extreme = ifelse(!!value.col >= thresh_down & !!value.col <= thresh_up, FALSE, TRUE)) %>%

      dplyr::ungroup()

    step.test <- df.base %>%
      dplyr::group_by(month) %>%
      dplyr::mutate(mean = mean(step, na.rm = T),
                    stabw = sd(step, na.rm = T),
                    thresh_down = mean - (f * stabw),
                    thresh_up = mean + (f * stabw),
                    bool.step = ifelse(step >= thresh_down & step <= thresh_up, FALSE, TRUE)) %>%

      ungroup()

    persistence.test <- df.base %>%
      dplyr::group_by(year, month) %>%
      dplyr::mutate(stabw_month = sd(!!value.col, na.rm = T)) %>%

      dplyr::ungroup() %>%

      dplyr::group_by(month) %>%
      dplyr::mutate(stabw_mean = mean(stabw_month, na.rm = T),
                    stabw_stabw = sd(stabw_month, na.rm = T),
                    thresh_down = 0,
                    thresh_up = stabw_mean + (f.pers * stabw_stabw),
                    bool.pers = ifelse(roll_sd >= thresh_down & roll_sd <= thresh_up, FALSE, TRUE)) %>%

      dplyr::ungroup()

    df.bool <- df.base %>%
      dplyr::left_join(dplyr::select(extreme.test, !!datecol.name, bool.extreme), by = datecol) %>%
      dplyr::left_join(dplyr::select(step.test, !!datecol.name, bool.step), by = datecol) %>%
      dplyr::left_join(dplyr::select(persistence.test, !!datecol.name, bool.pers), by = datecol) %>%

      dplyr::mutate(drop_bool = rowSums(dplyr::select(., dplyr::contains('bool'))))

    result <- result %>%
      dplyr::left_join(dplyr::select(df.bool, !!datecol.name, drop_bool), by = datecol) %>%
      dplyr::rename(!!col.name := drop_bool)
  }


  if (return == 'raw') result <- result
  if (return == 'clean') {
    result <- result %>%
      dplyr::filter_at(vars(contains('drop_bool')), all_vars(. == 0)) %>%
      dplyr::select(-contains('drop_bool'))
  }

  return(result)

}