##' @title Basic data quality control
##'
##' @description A helper function to perform minimal quality control on the data.
##' For now, this script only performs action on the three main temperature variables.
##'
##' @param dat a \code{tbl_df} generated form the \code{senamhiR} package
##'
##' @return a \code{tbl_df}
##'
##' @importFrom dplyr filter select starts_with
##' @importFrom tibble add_column
##' @importFrom rlang .data
##' @importFrom stats sd
##'
##' @export
##'
##' @author Conor I. Anderson
##'
qc <- function(dat) {
attrs_to_append <- append(attributes(dat)[!names(attributes(dat)) %in% c('rownames', 'names', 'class')],
list(`QC Date` = Sys.Date()))
if (inherits(dat, "character") & !inherits(dat, "data.frame")) {
if (length(dat) > 1L) {
stop("Sorry, for now this script can only process one station at a time.")
} else {
dat <- download_data(dat)
}
}
if (length(unique(format(dat$Fecha, format = "%Y"))) == 1) {
stop("You've passed a one-year table. We need (many) additional years of data for context.")
}
if (grepl("Observations", colnames(dat)[ncol(dat)])) {
observations <- select(dat, ncol(dat)) %>% unlist()
} else {
observations <- rep(NA, nrow(dat))
}
if (attr(dat, "Configuration") != "H") {
# Try to detect decimal place shifts.
maxshifts <- which(dat$`Tmax (C)` > 50 | dat$`Tmax (C)` < -50)
minshifts <- which(dat$`Tmin (C)` > 50 | dat$`Tmin (C)` < -50)
if (length(maxshifts) > 0) {
for (i in 1:length(maxshifts)) {
bad_table <- select(dat, .data$Fecha, var = "Tmax (C)")
fixes <- .fix_bad_data(bad_table, maxshifts[i], "Tmax", "dps")
dat$`Tmax (C)`[maxshifts[i]] <- unlist(fixes[1])
existingobs <- if (!is.na(observations[maxshifts[i]]) && observations[maxshifts[i]] != '') paste(observations[maxshifts[i]], ifelse((unlist(fixes[2]) != ''), "/ ", "")) else ""
observations[maxshifts[i]] <- paste0(existingobs, unlist(fixes[2]))
}
}
if (length(minshifts) > 0) {
for (i in 1:length(minshifts)) {
bad_table <- select(dat, .data$Fecha, var = "Tmin (C)")
fixes <- .fix_bad_data(bad_table, minshifts[i], "Tmin", "dps")
dat$`Tmin (C)`[minshifts[i]] <- unlist(fixes[1])
existingobs <- if (!is.na(observations[minshifts[i]]) && observations[minshifts[i]] != '') paste(observations[minshifts[i]], ifelse((unlist(fixes[2]) != ''), "/ ", "")) else ""
observations[minshifts[i]] <- paste0(existingobs, unlist(fixes[2]))
}
}
# Try to detect Tmin < Tmax (for now, we'll throw away these days)
minmaxerr <- which(dat$`Tmax (C)` < dat$`Tmin (C)`)
if (length(minmaxerr) > 0) {
for (i in 1:length(minmaxerr)) {
# First check Tmax
bad_table <- select(dat, .data$Fecha, var = "Tmax (C)")
fixes <- .fix_bad_data(bad_table, minmaxerr[i], "Tmax", "mme")
dat$`Tmax (C)`[minmaxerr[i]] <- unlist(fixes[1])
existingobs <- if (!is.na(observations[minmaxerr[i]]) && observations[minmaxerr[i]] != '') paste(observations[minmaxerr[i]], ifelse((unlist(fixes[2]) != ''), "/ ", "")) else ""
observations[minmaxerr[i]] <- paste0(existingobs, unlist(fixes[2]))
# Repeat the same for Tmin
bad_table <- select(dat, .data$Fecha, var = "Tmin (C)")
fixes <- .fix_bad_data(bad_table, minmaxerr[i], "Tmin", "mme")
dat$`Tmin (C)`[minmaxerr[i]] <- unlist(fixes[1])
existingobs <- if (!is.na(observations[minmaxerr[i]]) && observations[minmaxerr[i]] != '') paste(observations[minmaxerr[i]], ifelse((unlist(fixes[2]) != ''), "/ ", "")) else ""
observations[minmaxerr[i]] <- paste0(existingobs, unlist(fixes[2]))
}
}
# Recalculate Tmean and add observations
dat$`Tmean (C)` <- round((dat$`Tmax (C)` + dat$`Tmin (C)`)/2,1)
} else {
if (any(grepl("Nivel[0-9]{2}", names(dat)))) {
# Try to find bad river levels
levels <- select(dat, starts_with("Nivel"))
ranges <- apply(levels, 1, function(x) {max(x) - min(x)})
while (max(ranges, na.rm = TRUE) > 10 * mean(ranges, na.rm = TRUE)) {
index <- which.max(ranges)
slice <- index + -2:2
current_tab <- data.matrix(levels[slice,])
std_tab <- (current_tab - mean(current_tab))/sd(current_tab)
if (sum(std_tab > 1 | std_tab < -1) == 1) {
if (sum(std_tab > 1) == 1) {
bad_val <- which.max(std_tab)
coords <- which(levels[slice,] == current_tab[bad_val], arr.ind = TRUE)
ul <- mean(current_tab[-bad_val]) + 1.5 * sd(current_tab[-bad_val])
ll <- mean(current_tab[-bad_val]) - 1.5 * sd(current_tab[-bad_val])
if (current_tab[bad_val] / 10 <= ul & current_tab[bad_val] / 10 >= ll) {
observations[index] <- paste("Level dps:", current_tab[bad_val], "->", current_tab[bad_val] / 10)
levels[slice,][coords[1], coords[2]] <- current_tab[bad_val] / 10
} else {
observations[index] <- paste("Level err:", current_tab[bad_val], "-> NA")
levels[slice,][coords[1], coords[2]] <- NA
}
} else {
bad_val <- which.min(std_tab)
coords <- which(levels[slice,] == current_tab[bad_val], arr.ind = TRUE)
ul <- mean(current_tab[-bad_val]) + 1.5 * sd(current_tab[-bad_val])
ll <- mean(current_tab[-bad_val]) - 1.5 * sd(current_tab[-bad_val])
if (current_tab[bad_val] * 10 <= ul & current_tab[bad_val] * 10 >= ll) {
observations[index] <- paste("Level dps:", current_tab[bad_val], "->", current_tab[bad_val] *10)
levels[slice,][coords[1], coords[2]] <- current_tab[bad_val] * 10
} else {
observations[index] <- paste("Level err:", current_tab[bad_val], "-> NA")
levels[slice,][coords[1], coords[2]] <- NA
}
}
ranges[index] <- apply(levels[index,], 1, function(x) {max(x) - min(x)})
} else {
break
}
}
# Replace all of the old data
dat[,grep("Nivel", names(dat))] <- levels
}
if (any(grepl("Nivel Medio", names(dat)))) {
observations <- rep('', nrow(dat))
observations[!is.na(dat$`Nivel Medio (m)`) &
dat$`Nivel Medio (m)` < 0] <- paste(
"Level err:",
dat$`Nivel Medio (m)`[!is.na(dat$`Nivel Medio (m)`) &
dat$`Nivel Medio (m)` < 0],
"-> NA")
dat$`Nivel Medio (m)`[dat$`Nivel Medio (m)` < 0] <- NA
while (any(dat$`Nivel Medio (m)` > 10 * mean(dat$`Nivel Medio (m)`, na.rm = TRUE), na.rm = TRUE)) {
co <- which.max(dat$`Nivel Medio (m)`)
observations[co] <- paste("Level err:",
dat$`Nivel Medio (m)`[co],
"-> NA")
dat$`Nivel Medio (m)`[co] <- NA
}
}
}
# Add observations column to data
observations[is.na(observations)] <- ''
dat <- add_column(dat, Observations = observations)
attributes(dat) <- append(attributes(dat), attrs_to_append)
rownames(dat) <- NULL
dat
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.