#' Find duplicate or missing (half-)hours in a timestamped dataframe
#'
#' Looks at a time series of data to determine whether there are missing or
#' duplicate datetime entries. For every year and every day of year present in
#' `dat`, the values of the hour column (in the order they appear in `dat`) are
#' compared against the expected sequence: 0, 1, ..., 23 for hourly data, or
#' 0, 0.5, ..., 23.5 when the hour column contains any fractional value.
#' A day is flagged when its hours do not match that sequence exactly
#' (missing, duplicated or out-of-order entries).
#'
#' @param dat a dataframe containing the following timestamp columns: year,
#'   day of year, and hour
#' @param col_yr a character object specifying the name of the YEAR column
#' @param col_doy a character object specifying the name of the DOY column
#' @param col_hour a character object specifying the name of the HOUR column
#' @param flag_missing_days logical; when `TRUE`, days of the 365/366-day
#'   calendar that do not occur at all in a given year are flagged as well.
#'   Defaults to `FALSE`, in which case only days that are present in `dat`
#'   are checked.
#' @return A data.frame with one row per year found in `dat` (in order of first
#'   appearance) and two columns: `Year` and `DoY flagged`. `DoY flagged` is a
#'   character column holding the comma-separated flagged days of year,
#'   `">10 incorrect dates"` when more than 10 days are flagged, or `NA` when
#'   no day is flagged.
#' @export
fTimestampQC <- function(dat, col_yr, col_doy, col_hour,
                         flag_missing_days = FALSE) {
  #### BEGIN FUNCTIONS
  # Extract a column by its exact name. If no column has exactly that name,
  # fall back to the historical behaviour of treating the name as a regular
  # expression, but only when it identifies a single column unambiguously.
  fGetColumn <- function(col_name) {
    if (col_name %in% colnames(dat)) {
      return(dat[[col_name]])
    }
    hits <- grep(col_name, colnames(dat))
    if (length(hits) != 1L) {
      stop("Cannot identify a unique column named '", col_name, "' in `dat`",
           call. = FALSE)
    }
    dat[[hits]]
  }
  # Logical test (vectorised) for Gregorian leap years
  fIsLeapYr <- function(year) {
    (year %% 4 == 0 & year %% 100 != 0) | year %% 400 == 0
  }
  #### END FUNCTIONS

  years <- fGetColumn(col_yr)
  doys <- fGetColumn(col_doy)
  hours <- fGetColumn(col_hour)

  # Check whether the dataframe contains hourly or half-hourly data:
  # any fractional hour means half-hourly (n = 48), otherwise hourly (n = 24).
  if (any(hours %% 1 != 0, na.rm = TRUE)) {
    HH_seq <- seq(0, 23.5, 0.5)
  } else {
    HH_seq <- seq(0, 23, 1)
  }

  # Years actually present in the data, in order of first appearance.
  # Rows without a year cannot be attributed to any year and are skipped.
  years_of_record <- unique(years[!is.na(years)])

  # One report entry per year; NA means "nothing flagged"
  flagged_by_year <- rep(NA_character_, length(years_of_record))

  # Loop through each year, checking every day present in that year.
  # Within a given day, the hours must exactly match HH_seq.
  for (i in seq_along(years_of_record)) {
    process_year <- years_of_record[i]
    rows_i <- which(years == process_year)
    doy_i <- doys[rows_i]
    hour_i <- hours[rows_i]

    days_present <- sort(unique(doy_i), na.last = TRUE)
    day_is_ok <- vapply(
      days_present,
      # %in% (rather than ==) so that rows with an NA day form their own group
      function(d) isTRUE(all.equal(hour_i[doy_i %in% d], HH_seq)),
      logical(1)
    )
    flagged_doy <- days_present[!day_is_ok]

    if (isTRUE(flag_missing_days)) {
      # Set the final day in a 365 (or 366) day calendar
      nth_doy <- if (fIsLeapYr(as.numeric(process_year))) 366L else 365L
      absent_days <- setdiff(seq_len(nth_doy), doy_i)
      flagged_doy <- sort(c(flagged_doy, absent_days), na.last = TRUE)
    }

    if (length(flagged_doy) > 10) {
      flagged_by_year[i] <- ">10 incorrect dates"
    } else if (length(flagged_doy) > 0) {
      flagged_by_year[i] <- paste(flagged_doy, collapse = ",")
    }
  }

  df.report <- data.frame(
    Year = years_of_record,
    `DoY flagged` = flagged_by_year,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Let the user know if there were more than 2 years of bad data
  if (sum(!is.na(flagged_by_year)) > 2) {
    message('')
    message('More than 2 years contain data with incorrect timestamps!')
    message(' This could be related to:')
    message(' - Missing days')
    message(' - Missing (half-)hours')
    message(' - Days with extra (half-)hours')
    message(' - Daylight Savings Time')
    message('')
    message('Be sure to check the highlighted dates!')
    message('')
  }

  df.report
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.