# FluxSynthU: Suite of functions to analyze eddy covariance data
#
# Documented in fTimestampQC

#' Find duplicate or missing (half-)hours in a timestamped dataframe
#'
#' This function looks at a time series of data to determine whether there are
#' missing or duplicate datetime entries. For every year in `dat`, each day of
#' year present in the data is checked: its hour values, in row order, must
#' equal the full sequence 0, 1, ..., 23 (hourly data) or 0, 0.5, ..., 23.5
#' (half-hourly data). The data are treated as half-hourly as soon as any
#' non-integer hour value occurs anywhere in the hour column. Days that fail
#' the check (missing, extra, duplicated or out-of-order (half-)hours) are
#' flagged.
#'
#' By default only days that occur in the data are checked, so a day that is
#' absent altogether is not reported. Set `check_missing_days = TRUE` to also
#' flag every day of the calendar year (1-365, or 1-366 in leap years) that
#' has no rows at all.
#'
#' @param dat a dataframe containing the following timestamp columns: year,
#'   day of year, and hour
#' @param col_yr a character object specifying the name of the YEAR column
#' @param col_doy a character object specifying the name of the DOY column
#' @param col_hour a character object specifying the name of the HOUR column
#' @param check_missing_days logical; if `TRUE`, days of the calendar year
#'   with no rows in `dat` are flagged as well. Defaults to `FALSE`.
#'
#' @details Column names are matched exactly first. If there is no exact
#'   match, the name is used as a regular expression against the column names
#'   and must then identify exactly one column.
#'
#' @return A dataframe with one row per year found in `dat` (in order of first
#'   appearance) and two columns: `Year`, and the character column
#'   `DoY flagged`, which holds a comma-separated list of the flagged days of
#'   year, the text `">10 incorrect dates"` when more than 10 days are
#'   flagged, or `NA` when no day is flagged. A message is printed when more
#'   than 2 years contain flagged days.
#'
#' @examples
#' dat <- data.frame(Year = 2021,
#'                   DoY  = rep(1:2, times = c(24, 23)),
#'                   Hour = c(0:23, 0:22))
#' fTimestampQC(dat, "Year", "DoY", "Hour")
#'
#' @export
fTimestampQC <- function(dat, col_yr, col_doy, col_hour,
                         check_missing_days = FALSE) {

  #### BEGIN FUNCTIONS

  # fGetColumn returns the column of `dat` identified by `col_name` as a plain
  # vector. An exact name match wins; otherwise `col_name` is used as a regular
  # expression and must match exactly one column. `[[` is used so that the
  # result is a vector for tibbles as well as for base data frames.
  fGetColumn <- function(col_name) {
    if (!is.character(col_name) || length(col_name) != 1L || is.na(col_name)) {
      stop("Column names must be supplied as single character strings.",
           call. = FALSE)
    }
    hits <- which(colnames(dat) == col_name)
    if (length(hits) == 0L) {
      hits <- grep(col_name, colnames(dat))
    }
    if (length(hits) != 1L) {
      stop("'", col_name, "' must identify exactly one column of 'dat' (",
           length(hits), " matched).", call. = FALSE)
    }
    dat[[hits]]
  }

  # fIsLeapYr is TRUE for leap years (Gregorian rule), vectorised over `year`
  fIsLeapYr <- function(year) {
    (year %% 4 == 0 & year %% 100 != 0) | year %% 400 == 0
  }

  #### END FUNCTIONS

  stopifnot(is.data.frame(dat))

  yr_vec <- fGetColumn(col_yr)
  doy_vec <- fGetColumn(col_doy)
  hour_vec <- fGetColumn(col_hour)

  # Check whether the dataframe contains hourly or half-hourly data: any
  # non-integer hour means half-hourly. Missing hours are ignored here; the
  # days they belong to fail the sequence check below.
  if (all(hour_vec %% 1 == 0, na.rm = TRUE)) {
    # Hourly data: 0 to 23, increasing every 1 hour (n = 24)
    HH_seq <- seq(0, 23, 1)
  } else {
    # Half-hourly data: 0 to 23.5, increasing every 0.5 hour (n = 48)
    HH_seq <- seq(0, 23.5, 0.5)
  }

  # Determine which years are available within the dataframe. Only years that
  # actually occur are processed, so gaps between years and unsorted input are
  # both handled.
  if (anyNA(yr_vec)) {
    warning(sum(is.na(yr_vec)), " row(s) with a missing year were ignored.",
            call. = FALSE)
  }
  years_of_record <- unique(yr_vec[!is.na(yr_vec)])

  # For each year, build the report entry describing its flagged days
  flagged_by_year <- vapply(years_of_record, function(process_year) {
    in_year <- !is.na(yr_vec) & yr_vec == process_year
    doy_i <- doy_vec[in_year]
    hour_i <- hour_vec[in_year]

    # Distinct days in ascending order; rows with a missing day of year form
    # their own group (sorted last) so that they are flagged, not dropped.
    days <- sort(unique(doy_i), na.last = TRUE)
    day_index <- factor(match(doy_i, days), levels = seq_along(days))
    hours_by_day <- split(hour_i, day_index)

    # A day passes only if its hours, in row order, equal HH_seq
    matches_seq <- vapply(hours_by_day,
                          function(h) isTRUE(all.equal(h, HH_seq)),
                          logical(1), USE.NAMES = FALSE)
    flagged_doy <- days[!matches_seq]

    if (check_missing_days) {
      # Set the final day in a 365 (or 366) day calendar and flag every
      # calendar day that has no rows at all
      nth_doy <- if (fIsLeapYr(process_year)) 366L else 365L
      absent_doy <- setdiff(seq_len(nth_doy), days)
      flagged_doy <- sort(c(flagged_doy, absent_doy), na.last = TRUE)
    }

    if (length(flagged_doy) == 0L) {
      NA_character_
    } else if (length(flagged_doy) > 10L) {
      ">10 incorrect dates"
    } else {
      paste(flagged_doy, collapse = ",")
    }
  }, character(1), USE.NAMES = FALSE)

  # Build the output dataframe
  df.report <- data.frame(Year = years_of_record, stringsAsFactors = FALSE)
  df.report[["DoY flagged"]] <- flagged_by_year

  # Let the user know if there were more than 2 years of bad data
  if (sum(!is.na(df.report[["DoY flagged"]])) > 2) {
    message('')
    message('More than 2 years contain data with incorrect timestamps!')
    message('  This could be related to:')
    message('   - Missing days')
    message('   - Missing (half-)hours')
    message('   - Days with extra (half-)hours')
    message('   - Daylight Savings Time')
    message('')
    message('Be sure to check the highlighted dates!')
    message('')
  }

  df.report

}