R/main.R

Defines functions chk.hydat Get_drought_metrics Get_flood_metrics Get_ann_mean_flow percentile.ranked Threshold_Build bday_calculation negbin hurdle_test mk_test

Documented in bday_calculation Get_ann_mean_flow Get_flood_metrics hurdle_test mk_test negbin percentile.ranked Threshold_Build

################################################################################
#' mk_test
#'
#' @description
#' mk_test performs a Mann-Kendall Test and returns multiple test statistics given
#' a variable y and a variable x in vector form.
#'
#' @param y_var a vector of numbers representing the y variable
#' @param x_var a vector of numbers representing the x variable
#' @param z_low a number representing the z-value for the lower confidence level,
#' defaulted to 0.52, equivalent to the 70 percent confidence threshold
#' @param z_high a number representing the z-value for the upper confidence level,
#' defaulted to 1.28, equivalent to the 90 percent confidence threshold
#' @param keep_z a boolean for whether or not z-value will be included in the
#' output, defaulted to FALSE
#'
#' @return a dataframe of the form || slope || intercept || ConfidenceLevel || z_value ||
#'
#' @details
#'1. y and x variables should have equal length
#'2. Although the function does allow input y to contain duplicate values, please
#'   take into account that an excess amount of duplicates within y will likely
#'   decrease the accuracy of the result generated by the Mann-Kendall test. For
#'   data sets with too many duplicates in the dependant variable, consider
#'   testing for the trend using alternative methods, such as generalized linear
#'   model. Refer to other functions such as hurdle_test and negbin for more
#'   information.
#'
#' @export
mk_test <- function(y_var, x_var, z_low=0.52, z_high=1.28, keep_z=FALSE){
  sen <- zyp.sen(y_var~x_var)
  slope <- round(sen$coefficients[[2]],2)
  intercept <- round(sen$coefficients[[1]],2)
  corr<- cor.test(y_var, x_var, method = "kendall", alternative = "two.sided", exact = FALSE)
  Z <-abs(corr$statistic[["z"]])
  CATTrend <- case_when(Z>=z_high ~ "Confident",
                        Z>=z_low & Z<z_high ~ "Likely",
                        Z<z_low ~ "Uncertain")
  years.for.trend <- sum(!is.na(y_var))
  ret<- data.frame(slope=slope,intercept=intercept,CATTrend=CATTrend,
                   years.for.trend=years.for.trend, z_value=Z)
  if(!keep_z){
    ret <- ret %>% dplyr::select(-z_value)
  }
  return(ret)
}

################################################################################
#' hurdle_test
#'
#' @description
#' hurdle_test performs a hurdle test and returns multiple test statistics given
#' a variable y and a variable x. Hurdle test can be used to fit a linear model
#' when a data set contains an excess amount of zeros.
#'
#' @param y_var a vector of numbers representing the y variable
#' @param x_var a vector of numbers representing the x variable
#' @param zero_fam a character string representing the model family for the zero
#' portion of the data. Defaulted to "binomial" indicating binomial family
#' @param count_dist a character string representing the count distribution.
#' Defaulted to 'negbin', the negative binomial distribution
#' @param zero_dist a character string representing the zero distribution.
#' Defaulted to "binomial", the binomial distribution
#' @param link a character string representing the link for binomial zero hurdle
#' model. Defaulted to 'logit', the logarithm function
#'
#' @return a dataframe of the form || slope || intercept || ConfidenceLevel ||,
#' or a character string if the test is not applicable;
#'
#' @details
#' The hurdle test deals with zero and non-zero portions separatly using two
#' different models: the negative binomial distribution for the non-zero portion
#' and the binomial distribution for the zero portion. Confidence intervals are
#' created to report the level of confidence in the trend detected. A generalized
#' linear model (glm) will be then established between the two distributions with
#' a logarithm link function. The slope and intercept of the resulting model will
#' be recorded and returned.
#' Things to take notice of:
#' 1. y and x variables need to have the same length
#' 2. Although the detail section only describes the condition with a negative
#'    binomial count distribution and a binomial zero distribution, this function
#'    can be used for other distributions as well.
#' 3. Do note that this function does not check for the accuracy of the fitted
#'    model, neither does it provide any recommendations on what distribution
#'    fits the best. In order to use other families, it is highly suggested to
#'    look at the concept of general linear models beforehand.
#' 4. The program will return an error if the y variable does not contain any zeros.
#'    In order to calculate the trend using this function, make sure that there
#'    is at least one zero value in the y variable. If the error message "Data
#'    cannot be fitted using hurdle model" appears, user might want to consider
#'    using negative binomial model (provided as the negbin() function in the
#'    same package) for cases with excess amount of zeros, or Mann-Kendall Test
#'    for cases with limited ties and zeros.
#'
#' @export
hurdle_test <- function(y_var, x_var, zero_fam="binomial", count_dist="negbin",
                        zero_dist="binomial", link="logit"){
# Are we confident there is a trend?
# Count portion of hurdle
  y_count <- y_var[y_var>0]
  x_count <- x_var[y_var>0]
  model.count <- tryCatch(MASS::glm.nb(y_count~x_count),
                          error=function(e){return("A")})
  low.c<- ifelse(is.character(model.count), NA, exp(confint(profile(model.count), level=0.9))[2,1])
  hi.c <- ifelse(is.character(model.count), NA, exp(confint(profile(model.count), level=0.9))[2,2])
# Zero portion of hurdle
  zero_indicator <- !I(y_var==0)
  model.zero <- tryCatch(glm(zero_indicator~x_var, family=zero_fam),
                         error=function(e){return("A")})
  low.z<-ifelse(is.character(model.zero), NA, exp(confint(profile(model.zero), level=0.9))[2,1])
  hi.z<-ifelse(is.character(model.zero), NA, exp(confint(profile(model.zero), level=0.9))[2,2])
  if(!is.na(low.c)&!is.na(hi.c)&!is.na(low.z)&!is.na(hi.z)){
# combined confidence test @ 70% and 90% confidence
    pass <- ifelse(all(low.c*low.z<=1, hi.c*hi.z>=1), "Maybe?", "Confident")
    if (pass == "Maybe?"){
      low.c<- exp(confint(profile(model.count), level=0.7))[2,1]
      hi.c <- exp(confint(profile(model.count), level=0.7))[2,2]
      low.z<-exp(confint(profile(model.zero), level=0.7))[2,1]
      hi.z<-exp(confint(profile(model.zero), level=0.7))[2,2]
      pass <- ifelse(all(low.c*low.z<=1, hi.c*hi.z>=1), "Uncertain", "Likely")
    }
# Get slope and intercept from hurdle
    model <- hurdle(y_var~x_var, dist=count_dist, zero.dist=zero_dist, link=link)
    fitted <- unname(model$fitted.values)
    slope <- round((fitted[length(fitted)] - fitted[1])/
                     (max(x_var) - min(x_var)),2)
    intercept <- round((fitted[1]-min(x_var)*((fitted[length(fitted)] - fitted[1])/
                          (max(x_var) - min(x_var)))),2)
    years.for.trend <- sum(!is.na(y_var))
    data.frame(slope=slope,intercept=intercept,CATTrend=pass,years.for.trend=years.for.trend)
  }else{
    "Error: Data cannot be fitted using hurdle model"
    data.frame(slope=NA,intercept=NA,CATTrend=NA,years.for.trend=NA)
  }
}

################################################################################
#' negbin
#'
#' @description
#' negbin fits a negative binomial model and returns multiple test statistics
#' given a variable y and a variable x.
#'
#' @param y_var a vector of numbers representing the y variable
#' @param x_var a vector of numbers representing the x variable
#'
#' @return a dataframe of the form || slope || intercept || ConfidenceLevel ||
#'
#' @details
#' The negbin function fits a negative binomial model to a sets of x and y variables
#' Confidence intervals are created to measure the level of confidence in the trend.
#' The slope and intercept of the resulting model are recorded and returned.
#' Things to take notice:
#' 1. y and x variables need to have the same length
#' 2. Theoretically saying, negbin is the most versatile among the three trend tests.
#'    The preferred condition to use negbin function is when there is neither an
#'    excess amount of zeros, nor can the trend be calculated by Mann-Kendall test.
#'    For those conditions, consider using the two other functions instead.
#'
#' @export
negbin <- function(y_var, x_var){
  model <- tryCatch(glm.nb(y_var~x_var),
                           error=function(e){return("A")})
  low<-ifelse(is.character(model), NA, exp(confint(profile(model), level=0.9))[2,1])
  hi<-ifelse(is.character(model), NA, exp(confint(profile(model), level=0.9))[2,2])
  if (!is.na(low)&!is.na(hi)){
# confidence test @ 70% and 90% confidence
    pass <- ifelse(all(low<=1, hi>=1), "Maybe?", "Confident")
    if (pass=="Maybe?"){
      low <- exp(confint(profile(model), level=0.7))[2,1]
      hi <-exp(confint(profile(model), level=0.7))[2,2]
      pass <- ifelse(all(low<=1, hi>=1),"Uncertain", "Likely")
    }
# Get slope from negative binomial
    fitted <- unname(model$fitted.values)
    slope <- round((fitted[length(fitted)] - fitted[1])/
                     (max(x_var) - min(x_var)),2)
    intercept <- round((fitted[1]-min(x_var)*((fitted[length(fitted)] - fitted[1])/
                       (max(x_var) - min(x_var)))),2)
    years.for.trend <- sum(!is.na(y_var))
    data.frame(slope=slope,intercept=intercept,CATTrend=pass,years.for.trend=years.for.trend)
  }else{
    "Error: Data cannot be fitted using negative binomial model"
    data.frame(slope=NA,intercept=NA,CATTrend=NA,years.for.trend=NA)
  }
}

################################################################################
#' bday_calculation
#'
#' @description
#' bday_calculation calculates the last and first bday for specified years from
#' the hydrometric record at a station.
#'
#' @param station a character string representing the station number that exists
#' in hydat
#' @param year a vecter of numbers representing the list of years used for calculating
#' bdays
#'
#' @return a dataframe of the format || station || year || lastbday || firstbday ||
#'
#' @details
#' bdays are days in the HYDAT hydrometric database with data symbol "B", which
#' represent ice conditions.
#'
#' @export
bday_calculation <- function(station, year="all"){
  flow.daily <- hy_daily_flows(station)
  flow.daily$Year <- as.numeric(format(flow.daily$Date, "%Y"))
  flow.daily$Month<- as.numeric(format(flow.daily$Date, "%m"))
  if (year=="all"){
    valid_year=unique(flow.daily$Year)
  }else if(any(!year %in% flow.daily$Year)){
    return("Some of the year provided are not in HYDAT")
    valid_year=year[(year %in% unique(flow.daily$Year))]
  }else{
    valid_year=year
  }
  first_vec = c()
  last_vec = c()
  for (i in valid_year){
    print(i)
# First B day
    flow.late <- flow.daily %>% filter(Year==i & Month > 8)
    firstbday <- flow.late %>% filter(Symbol == "B", Month >= 10) %>% head(1)
    firstbday.date <- ifelse(nrow(firstbday)>0, substring(firstbday$Date,1,10), NA)

    firstbday.long <- ifelse(is.na(firstbday.date),paste0(i,"-10-01"),
                             ifelse(firstbday.date > as.Date(paste0(i, "-10-01")),
                                    paste0(i, "-10-01"),firstbday.date))
    first_vec = c(first_vec, firstbday.long)
# Last B day
    flow.early <- flow.daily %>% filter(Year == i & Month < 6)
    lastbday <- flow.early %>% filter(Symbol == "B", Month < 6) %>% tail(1)
    lastbday.date <- ifelse(nrow(lastbday)>0, substring(lastbday$Date,1,10), NA)

    lastbday.long<- ifelse(is.na(lastbday.date),paste0(i,"-05-01"),
                           ifelse(lastbday.date < as.Date(paste0(i, "-05-01")),
                                  paste0(i,"-05-01"),lastbday.date))
    last_vec = c(last_vec, lastbday.long)
  }
  ret = data.frame(Station=rep(station, length(valid_year)),
                   Year = valid_year,
                   LastBday = last_vec,
                   FirstBday = first_vec)
  return(ret)
}

################################################################################
#' Threshold_Build
#'
#' @description
#' Threshold_Build calculates streamflows at high and low percentiles identified
#' as thresholds (hi & low) based on streamflow during a reference period, between
#' year_from and year_to, for a list of stations. Only streamflow during the ice
#' free period of the reference period, as identified with the bday_calculation
#' function, is used in the calculation of the lower threshold.
#'
#'
#' @param stations a vector of characters indicating the list of stations we want
#' to calculate the thresholds for
#' @param yfrom a number indicating the year threshold calculation starts,
#' defaulted to 1981;
#' @param yto a number indicating the year threshold calculation ends, defaulted
#' to 2010;
#' @param hi a number indicating the upper threshold, defaulted to 0.95;
#' @param low a number indicating the lower threshold, defaulted to 0.05
#' @param write_csv a boolean value indicating whether or not the output will be
#' written to a csv file, defaulted to FALSE;
#'
#' @return
#' A dataframe of the form || station || UpperThreshold || LowerThreshold ||
#'
#' @export
Threshold_Build <- function(stations, yfrom=1981, yto=2010, hi=0.95, low=0.05,
                            write_csv=FALSE){
  Upper = c()
  Lower = c()
  for (i in 1:length(stations)){
    stn.id <- stations[i]
    print(stn.id)
    flow.daily <- hy_daily_flows(stn.id)
    flow.daily$Year = as.numeric(substr(flow.daily$Date, 1, 4))
    ref <- flow.daily %>% filter(Year>=yfrom, Year<=yto)
    summ = ref %>% group_by(Year) %>% summarise(count=length(Value[!is.na(Value)]))
    if (length(summ$Year)<20 | length(summ$count[summ$count>=150])<length(summ$count)){
      Upper=append(Upper, NA)
      Lower=append(Lower, NA)
      print(paste0(stn.id, "---not enough data"))
    }else{
      Upper_All=ref$Value
      for (i in 1:length(Upper_All)){
        if (is.na(Upper_All[i])){
          Upper_All[i]=0
        }
      }
      yr=summ$Year
      num_leap_yrs=length(yr[as.numeric(yr)%%4==0])
      total_length=length(yr)*365+num_leap_yrs
      Upper_All=append(Upper_All, rep.int(0, total_length-length(Upper_All)))
      Lower_All=c()
      # First B day
      flow.daily$Month = as.numeric(format(flow.daily$Date, "%m"))
      for (j in 1:length(summ$Year)){
        year.op=summ$Year[j]
        fbday=bday_calculation(stn.id, year.op)
        lastbday = (fbday %>% filter(Year==year.op))[["LastBday"]]
        firstbday = (fbday %>% filter(Year==year.op))[["FirstBday"]]
        Value_within=(ref %>% filter(as.Date(Date)>=as.Date(lastbday)&a
                                     s.Date(Date)<=as.Date(firstbday)))$Value
        Value_within=Value_within[Value_within!=0]
        Lower_All=append(Lower_All, Value_within)
      }
      Upper=append(Upper,quantile(Upper_All, probs = hi, names = FALSE, na.rm = TRUE))
      Lower=append(Lower,quantile(Lower_All, probs = low, names = FALSE, na.rm = TRUE))
      print(stn.id)
    }
  }
  Threshold = data.frame(STATION_NUMBER = stations, Q_Upper=Upper, Q_Lower=Lower)
  if (write_csv){
    write.csv(Threshold, file = "./Output/Threshold_New.csv", row.names = FALSE)
  }
  return(Threshold)
}

################################################################################
#' percentile.ranked
#'
#' @description
#' Calculates the percentage of values in a vector smaller than value
#'
#' @param a.vector a vector of any comparable data type, ideally numeric;
#' @param value a comparable data type to the components in a.vector, ideally
#' numeric
#'
#' @return the percentage of values within the vector that are smaller than the
#' specified value
#'
#' @export

percentile.ranked <- function(a.vector, value) {
  numerator <- length(sort(a.vector)[a.vector < value])
  denominator <- length(a.vector)
  round(numerator/denominator,2)*100
}


################################################################################
#' Get_ann_mean_flow
#'
#' @description
#' Get_ann_mean_flow calculates the annual mean flow based at a station for
#' specified years.
#'
#' @param station a character string indicating the station number to calculate
#' the annual mean flow
#' @param year a vector indicating the list of years when the annual mean flow is
#' calculated. Default is 'all' indicating all recorded years at the station
#'
#' @return
#' a dataframe of the form || station || Year || annual mean flow ||
#'
#' @export

Get_ann_mean_flow <- function(station, year='all'){
  ret=data.frame()
  mean_flow = c()
  flow.daily <- hy_daily_flows(station)
  flow.daily$Year <- as.numeric(format(flow.daily$Date, "%Y"))
  if (year=="all"){
    valid_year=unique(flow.daily$Year)
  }else if(any(!year %in% flow.daily$Year)){
    return("Some of the year provided are not in HYDAT")
    valid_year=year[(year %in% unique(flow.daily$Year))]
  }else{
    valid_year=year
  }
  for (i in valid_year){
    flow.dates = flow.daily[flow.daily$Year==i,]
    if (nrow(flow.dates %>% filter(!is.na(Value)))<150){
      ann_mean_flow = NA
    }else{
      if (nrow(flow.dates%>%filter(!is.na(Value))) > 0.9 *
          ifelse(flow.dates$Year[1]%%4==0, 366, 365)){
        ann_mean_flow <- flow.dates %>% mutate(mean_flow = mean(Value, na.rm = TRUE))
        ann_mean_flow <- ann_mean_flow[1, "mean_flow"] %>% round(2)
        ann_mean_flow <- ann_mean_flow$mean_flow
      } else {
        Mode <- flow.daily  %>% group_by(Year) %>% filter(!is.na(Value)) %>%
          summarise(MODE=names(sort(table(Value), decreasing = TRUE)[1]))
        Mode.multi <- mean(as.numeric(Mode$MODE))
        if (Mode.multi <0.01){
          ann_mean_flow <- flow.dates %>% filter(!is.na(Value)) %>% group_by(Year) %>%
            summarise(TOTAL=sum(Value)/n())
          ann_mean_flow <- round(ann_mean_flow$TOTAL,2)
        } else {
          ann_mean_flow <- NA
        }
      }
    }
    mean_flow <- c(mean_flow, ann_mean_flow)
  }
  print(station)
  ret = data.frame(STATION_NUMBER=rep(station, length(mean_flow)),
                   Year = valid_year,
                   ann_mean_flow = mean_flow)
  return(ret)
}

################################################################################
#' Get_flood_metrics
#'
#' @description
#' Get_flood_metrics calculates four flood metrics at a specified hydrometric station
#' for a range of years specified:
#'   pot_threshold: the upper threshold calculated in Threshold_Build function;
#'   pot_days: days with streamflow over the threshold within a year;
#'   pot_events: independent flood events over a year;
#'   pot_max_dur: the maximum consecutive days over the threshold within a year
#' The four metrics are recorded in a table and returned, along with the maximum
#' flow in each year.
#'
#' @param station a character string representing a station we want the flood
#' metrics to be calculated
#' @param year a vector representing the list of years for which we want the flood
#' metrics to be calculated. Defaulted to 'all', indicating all the years with
#' data at the station
#'
#' @return
#' A dataframe of the form || STATION_NUMBER || Year ||pot_threshold  || pot_days
#'                            || pot_events || pot_max_dur || X1_day_max ||
#'
#' @details
#' POT stands for peaks-over-threshold.
#' Note that there are requirements regarding the independence conditions for
#' pot_event metric; we implement a check for the correlation between flood events.
#' Referring to Lang et al. (1999), we set two conditions:
#'   1) R > 5+log(A/1.609^2); and
#'   2) Xmin < 0.75 * min(Xi, Xj).
#'   where: R is the time in days between two peaks
#'          A is the watershed area in squared kilometers
#'          Xmin is the minimum flow between the adjacent flow peaks Xi and Xj
#' In order for two adjacent peaks to be considered as independent flood events,
#' both conditions need to be met. If either of the conditions fail, we consider
#' the two events as the same one. R is defined within the iteration.
#'
#' @references
#' Slater et al.(2016). On the impact of gaps on trend detection un extreme
#' streamflow time series. Int.J.climatol. 37:3976-3983(2017). doi: 10.1002/joc.4954
#'
#'#' @export
Get_flood_metrics <- function(station, year='all'){
  ret = data.frame()
  flow.daily <- hy_daily_flows(station)
  flow.daily$Year <- as.numeric(format(flow.daily$Date, "%Y"))
  if (year=="all"){
    valid_year=unique(flow.daily$Year)
  }else if(any(!year %in% unique(flow.daily$Year))){
    return("Some of the year provided are not in HYDAT")
    valid_year=year[(year %in% unique(flow.daily$Year))]
  }else{
    valid_year=year
  }
  station_row = Threshold_Build(station)
  max_flow = c()
  pot_days = c()
  pot_events = c()
  pot_threshold = c()
  pot_max_dur = c()
  hydat_info <- hy_stations(station)
  stn_area <- hydat_info$DRAINAGE_AREA_GROSS
  R <- 5 + log(stn_area/(1.609^2))%>%round(1)
  for (yr in valid_year){
    flow.dates = flow.daily %>% filter(Year == yr)
# do not calculate values for years with less than 150 days
    if (any((nrow(flow.dates %>% filter(!is.na(Value)))<150),is_empty(flow.dates$Date))){
      pot_days = c(pot_days, NA)
      pot_events =c(pot_events, NA)
      pot_threshold = c(pot_threshold, NA)
      pot_max_dur = c(pot_max_dur, NA)
      max_flow <- c(max_flow,NA)
    }else{
      max.flow <- flow.dates %>% arrange(desc(Value)) %>% slice(1)
      max.flow <- round(max.flow[["Value"]], digits = 2)
      max_flow <- c(max_flow,max.flow)
      if (is.na(station_row$Q_Upper)){
        pot_days = c(pot_days, NA)
        pot_events =c(pot_events, NA)
        pot_threshold = c(pot_threshold, NA)
        pot_max_dur = c(pot_max_dur, NA)
      }else{
        threshold <- as.numeric(station_row$Q_Upper)
# POT function triggers 'computational singularity' warning or error when threshold >=10
# This factor scales inputs and one of the outputs below to avoid the singularity.
        if (max.flow<threshold){
          pot_days = c(pot_days, 0)
          pot_events =c(pot_events, 0)
          pot_threshold = c(pot_threshold, threshold)
          pot_max_dur = c(pot_max_dur, 0)
        }else{
          factor <- case_when( threshold >= 1000 ~ 1000,
                             threshold >= 100 ~ 100,
                             threshold >= 10 ~ 10,
                             threshold >= 0 ~ 1)
          pot2 <- possibly(evir::pot, otherwise = NA)
          pot <- pot2(flow.dates$Value[!is.na(flow.dates$Value)]/factor, threshold/factor)

          if (!is.list(pot)) {
            pot_days = c(pot_days, NA)
            pot_events =c(pot_events, NA)
            pot_threshold = c(pot_threshold, NA)
            pot_max_dur = c(pot_max_dur, NA)

          } else {
            pot_days =c(pot_days, length(pot$data))
            pot.dates <- data.frame(dates = attr(pot$data, "times"))
            pot.dates$lag <- pot.dates$dates - lag(pot.dates$dates, 1)
            pot_threshold = c(pot_threshold, threshold)
            seq_blocks <- split(pot.dates$dates, cumsum(c(TRUE, diff(pot.dates$dates)!=1)))
# We iterate within seq_blocks to find maximum flow within each events, before testing
# their independence
            events_count <- c()
            for (a in 1:length(seq_blocks)){
              events_count <- append(events_count, length(seq_blocks[[as.character(a)]]))
            }
            events_array <- c()
            for (b in 1:length(events_count)){
              c <- events_count[b]
              while (c > 0){
                events_array<-append(events_array,b)
                c <- c-1
              }
            }
            pot_dat <- data.frame(data = pot$data, dates=attr(pot$data, "times"),event_num=events_array)
            max_peak <- pot_dat %>% group_by(event_num) %>% summarise(max=max(data))
            max_peak_central<-c()
            for (d in 1:length(max_peak$max)){
              max_peak_dates<-(pot_dat %>% filter(data==max_peak$max[d],dates>=seq_blocks[[d]][1],
                                                dates<=seq_blocks[[d]][length(seq_blocks[[d]])]))$dates
              if (length(max_peak_dates)%%2==1){
                max_peak_dates=max_peak_dates[(length(max_peak_dates)+1)/2]
              } else {
                max_peak_dates=(max_peak_dates[length(max_peak_dates)/2]+
                                max_peak_dates[length(max_peak_dates)/2+1])/2
              }
              max_peak_central=append(max_peak_central, max_peak_dates)
              max_peak_dates=c()
            }
            max_peak$central_dates<-max_peak_central
            R_vec<-c()
            for (t in 1:length(max_peak$max)){
              R_vec<-append(R_vec, R)
            }
            max_peak$R_vec<-R_vec
            Xmin_vec<-c()
            if (length(max_peak$max)==1){
              Xmin_vec<-append(Xmin_vec, NA)
            }else{
              for( e in 1:(length(max_peak$max)-1)){
                Xmin_vec<-append(Xmin_vec,
                             0.75*min(max_peak$max[e],max_peak$max[e+1])*factor)
              }
            Xmin_vec<-append(Xmin_vec, NA)
            }
            max_peak$Xmin_vec<-Xmin_vec
            ind_events<-c()
            if (length(seq_blocks)>1){
              for (j in 1:(length(seq_blocks)-1)){
                range_flow <- flow.dates %>% filter(Date>=as.Date(max_peak$central_dates[j], origin=paste0(yr, "-01-01")),
                                                     Date<=as.Date(max_peak$central_dates[j+1], origin=paste0(yr, "-01-01")))
                if (all(min(range_flow$Value, na.rm=TRUE)<Xmin_vec[j],
                        ((max_peak$central_dates[j+1] - max_peak$central_dates[j])>R))){
                  ind_events <- append(ind_events, 1)
                }else{
                  ind_events <- append(ind_events, 0)
                }
              }
              ind_events <- append(ind_events, 1)
            } else {
              ind_events <- c(1)
            }
            max_peak$ind_events<-ind_events
            pot_events <- c(pot_events, sum(ind_events))
            max_dur_count <- c()
            count<-0
            for (f in 1:length(ind_events)){
              if (ind_events[f]==1 | is.na(ind_events[f])){
                max_dur_count = append(max_dur_count, count + events_count[f])
                count = 0
              } else { # i.e. in_events[i] == 0
                count <- count + events_count[f]}
            }
            pot_max_dur  = c(pot_max_dur, max(max_dur_count))
          }
        }
      }
    }
  }
  ret = data.frame(STATION_NUMBER=rep(station,length(valid_year)),
                   Year=valid_year,
                   pot_threshold=pot_threshold,
                   pot_days=pot_days,
                   pot_events=pot_events,
                   pot_max_dur=pot_max_dur,
                   X1_day_max=max_flow)
  return(ret)
}

################################################################################
#' Get_drought_metrics
#'
#' @description
#' The function calculates four drought metrics at a specified hydrometric station
#' for a range of years specified:
#'   dr_threshold: the lower threshold calculated in Threshold_Build function;
#'   dr_days: days with streamflow below the threshold within a year;
#'   dr_events: independent drought events over a year;
#'   dr_max_dur: the maximum consecutive days under the threshold within a year;
#' The four metrics are recorded in a table and returned, along with the 7-day
#' minimum flow in each year.
#'
#' @param station a character string representing a station where we want the drought
#' metrics to be calculated
#' @param year a vector representing the list of years for which we want the drought
#' metrics to be calculated. Defaulted to 'all', indicating all the years with data
#' at the station
#'
#' @return
#' A dataframe of the form || STATION_NUMBER || Year || dr_threshold || dr_days ||
#'                             dr_events  || dr_max_dur || X7_day_min ||
#'
#' @details
#' The four metrics are calculated only between the last and first bday of each
#' year, as calculated with the bday_calculation function.
#' In order to ensure independence between drought events, we set the following
#' conditions:
#'   1) t >= 5; and
#'   2) the excess volume, Vabove / min(Vbelowi, Vbelowj)>=0.1.
#'   where: t is the time between events in days
#'          Vabove is the volume exceeding the threshold
#'          Vbelowi and Vbelowj are the volumes below the threshold during
#'            events i and j.
#' For two adjacent troughs to be considered as independent drought events,
#' both conditions need to be met. If either of the conditions fail, we consider
#' the two events as the same one.
#'
#' @references
#' WMO World Meteorological Organization (2008). Manual on Low-Flow Estimation
#' and Prediction. 1339 Operational Hydrology.
#'
#' @export

Get_drought_metrics <- function(station, year='all'){
  ret = data.frame()
  flow.daily <- hy_daily_flows(station)
  flow.daily$Year <- as.numeric(format(flow.daily$Date, "%Y"))
  if (year=="all"){
    valid_year=unique(flow.daily$Year)
  }else if(any(!year %in% unique(flow.daily$Year))){
    return("Some of the year provided are not in HYDAT")
    valid_year=year[(year %in% unique(flow.daily$Year))]
  }else{
    valid_year=year
  }
  station_row = Threshold_Build(station)
  bday = bday_calculation(station, valid_year)
  min7days = c()
  dr_days = c()
  dr_events = c()
  dr_threshold = c()
  dr_max_dur = c()
  for (yr in valid_year){
    bd=bday %>% filter(Year==yr)
    flow.dates = flow.daily %>% filter(Year == yr)
    dates_value=flow.dates$Value
# do not calculate values for years with less than 150 days of data
    if (nrow(flow.dates %>% filter(!is.na(Value)))<150){
      min7days <- c(min7days,NA)
      dr_days <- c(dr_days,NA)
      dr_events <- c(dr_events,NA)
      dr_max_dur <- c(dr_max_dur,NA)
      dr_threshold <- c(dr_threshold,NA)
    }else{
      if (is.na(flow.dates$Value[1])){
        front_ind=0
        while(is.na(dates_value[1])){
          dates_value=dates_value[2:length(dates_value)]
          front_ind=front_ind+1
        }
        date_ref_front=flow.dates$Date[front_ind+1]
        flow.dates=flow.dates %>% filter(as.Date(Date)>=as.Date(date_ref_front))
      }
      if (is.na(dates_value[length(dates_value)])){
        end_ind=0
        while(is.na(dates_value[length(dates_value)])){
          dates_value=dates_value[1:(length(dates_value)-1)]
          end_ind=end_ind+1
        }
        date_ref_end=flow.dates$Date[length(flow.dates$Value)-end_ind]
        flow.dates=flow.dates %>% filter(as.Date(Date)<=as.Date(date_ref_end))
      }
      date_vec<-flow.dates$Date
      flow.new<-flow.dates%>%mutate(Date=seq(1, n()))%>%mutate(Value=approx(Date,Value,Date)$y)
      flow.new$Date<-date_vec
      zooflow <- zoo(flow.new$Value, flow.new$Date)
# calculate 7-day minimum flow
      zooflow.summ <- window(zooflow, start = as.Date(bd$LastBday),
                           end = as.Date(bd$FirstBday))
      daymean7summ <- rollmean(zooflow.summ, 7)
      daymin7summ <- which.min(daymean7summ)
      if (length(daymin7summ) > 0){
        min7days <- c(min7days,round(daymean7summ[[daymin7summ]],2))
      } else {
        min7days <- c(min7days,NA)
      }
# if loop to stop calculations if there is no threshold
      if (is.na(station_row$Q_Lower)){
        dr_days = c(dr_days, NA)
        dr_events =c(dr_events, NA)
        dr_threshold = c(dr_threshold, NA)
        dr_max_dur = c(dr_max_dur, NA)
      }else{
        if (length(zooflow.summ) > 60){
          low <- as.numeric(station_row$Q_Lower)
          data.s <- data.frame(day= as.numeric(substring(flow.new[["Date"]], 9,10)),
                           month=as.numeric(substring(flow.new[["Date"]],6,7)),
                           year=yr,flow=flow.new$Value, hyear=yr,
                           baseflow=NA,
                           date=paste0(yr,"-",substring(flow.new[["Date"]],6,11)))

          data.s <- data.s %>% filter((as.Date(data.s$date) >= as.Date((bd[,3])[1])) &
                                    (as.Date(data.s$date) <= as.Date((bd[,4])[1])))

          data.lf <- createlfobj(data.s)
          flowunit(data.lf) <- "m^3/s"
          drought <- find_droughts(data.lf, threshold = low)
          summary2 <- possibly(base::summary, otherwise = data.frame(event.no=NULL))
          duration<-summary2(drought)

          if (any(duration[1,"event.no"] == 0, nrow(duration)==0)){
            dr_days  = c(dr_days,0)
            dr_events  = c(dr_events, 0)
            dr_max_dur =c(dr_max_dur, 0)
          } else {
            dr_days <- c(dr_days,sum(duration$duration))
            vol<-c()
            for (i in 1:length(drought$def.increase)){
              vol<-append(vol, drought$def.increase[[i]])
            }
            drevent_count<-c()
            zero_count<-c()
            for ( i in 1:length(drought$event.no)){
              drevent_count<-append(drevent_count, drought$event.no[[i]])
              if (drought$event.no[[i]]==0){
                zero_count<-append(zero_count, i)
              }
            }
# account for fact some years start with drought
#            if (drought$event.no[1]==1) {
#              ind <- 1
#            } else {
              ind<-0
#            }
            if (length(zero_count)==0){ #i event that last for the entire period
              dr_events=c(dr_events,1)
              dr_max_dur=c(dr_max_dur,duration$duration)
            } else {
              if (length(zero_count)==1){
                drevent_count[zero_count[1]]<-"0_1"
              } else {
                for ( i in 1:(length(zero_count)-1)){
                  if ((zero_count[i]+1)==zero_count[i+1]){
                    drevent_count[zero_count[i]]<-paste0(as.character(ind), "_", as.character(ind+1))
                  } else {
                    drevent_count[zero_count[i]]<-paste0(as.character(ind), "_", as.character(ind+1))
                    ind<-ind+1
                  }
                  drevent_count[zero_count[length(zero_count)]]<-paste0(as.character(ind), "_", as.character(ind+1))
                }
              }
              dr_filter<-data.frame(volume=vol, dr_event_count=drevent_count)
              dr_summary<-dr_filter %>% group_by(dr_event_count) %>% summarise(sum=sum(volume))
              dr_ind_events<-c()
              if (nrow(duration)>1){
                for ( i in 1:(nrow(duration)-1)){
                  t <- as.numeric(duration$start[i+1]-duration$end[i])
                  v_above<-(dr_summary %>% filter(dr_event_count==paste0(as.character(i), "_",
                                                                     as.character(i+1))))$sum
# account for cases where v_above can't be calculated
                  if (any(is.na(v_above),is_empty(v_above))) {
                    dr_ind_events<-append(dr_ind_events, 0)
                  } else {
                    if (all(t>=5, abs(v_above/min(duration$volume[i], duration$volume[i+1]))>=0.1)){
                      dr_ind_events<-append(dr_ind_events, 1)
                    } else {
                      dr_ind_events<-append(dr_ind_events, 0)
                    }
                  }
                }
                dr_ind_events<-append(dr_ind_events, 1)
              } else {
                dr_ind_events<-c(1)
              }
              dr_events <- c(dr_events,sum(dr_ind_events))
              dr_max_dur_count <- c()
              dr_count<-0
              for (i in 1:length(dr_ind_events)){
                if (dr_ind_events[i]==1){
                  dr_max_dur_count <- append(dr_max_dur_count, dr_count + duration$duration[i])
                  dr_count <- 0
                } else { # i.e. in_events[i] == 0
                  dr_count <- dr_count + duration$duration[i]
                }
              }
              dr_max_dur <- c(dr_max_dur,max(dr_max_dur_count))
            }
          }
          dr_threshold <- c(dr_threshold,(round(station_row$Q_Lower,2)))
        } else {
          dr_days <- c(dr_days,NA)
          dr_events <- c(dr_events,NA)
          dr_max_dur <- c(dr_max_dur,NA)
          dr_threshold <- c(dr_threshold,NA)
        }
      }
    }
  }
  ret = data.frame(STATION_NUMBER=rep(station,length(valid_year)),
                   Year=valid_year,
                   dr_threshold=dr_threshold,
                   dr_days=dr_days,
                   dr_events=dr_events,
                   dr_max_dur=dr_max_dur,
                   X7_day_min=min7days)
  return(ret)
}

################################################################################
#' chk.hydat
#'
#' @description
#' Function to define where to look for HYDAT database, and import it if necessary,
#' so it can be accessed through functions from the tidyhydat package.
#'
#' @param hydat.path a character string indicating the path for the HYDAT sqlite3
#' file relative to the current working directory. Defaulted to NULL, in which
#' case the file will be in the folder "./Dependencies"
#'
#' @return
#' downloading the latest version of the HYDAT database into a folder and setting
#' the database path
#'
#' @export

chk.hydat <- function(hydat.path = NULL){
# Find the current working directory, check if it has a folder for "Dependencies",
# and if not, create it.
  main_dir <- getwd()
  hydat.path <- "./Dependencies"
  if(length( grep("Dependencies", list.files(main_dir)))==0){
    dir.create(file.path(main_dir, hydat.path))
  }
  hy_file <- paste0(hydat.path,"/Hydat.sqlite3")

# Check if the HYDAT database is already in the dependencies folder, and if not,
# download it.
  if(length( grep("Hydat.sqlite", list.files(hydat.path)))==0){
    print(paste("No hydat database found in", hydat.path))
    print(paste("It will be downloaded to", hydat.path))
    download_hydat(dl_hydat_here = hydat.path)
  } else {
    print(paste("Hydat database found in", hydat.path))
  }
# set the path to database
  hy_set_default_db(hydat_path = hy_file)
}
sarahforte/CESIfun documentation built on Dec. 22, 2021, 10:16 p.m.