R/brf_level_shift.R

Defines functions set_level_shift find_level_shift gap_fill get_fit_summary pree formula_from_recipe gap_fill2 get_shift get_level_shift_coef level_shift_to_datetime get_intercept_stats add_level_shifts stretch_interp find_gaps fit_gaps predict_gaps

Documented in add_level_shifts find_gaps find_level_shift fit_gaps formula_from_recipe gap_fill gap_fill2 get_fit_summary get_intercept_stats get_level_shift_coef get_shift level_shift_to_datetime predict_gaps set_level_shift stretch_interp

# THESE FUNCTIONS ARE BRITTLE ---------------------------------------------



#' set_level_shift
#'
#' Create a factor column with the data subseted according to break points.
#' 
#' @param x vector of numeric
#' @param level_breaks break points in the data
#'
#' @return vector of type factor 
#' @export
#'
#' @examples
#' set_level_shift(1:100, c(2, 40, 86))
set_level_shift <- function(x,
                            level_breaks = NULL) {
  
  br <- c(min(x, na.rm = TRUE),
          level_breaks,
          max(x, na.rm = TRUE))
  attr(br, "tzone") <- "UTC"
  cut(x, 
      breaks = br,
      include.lowest = TRUE)
}



#' find_level_shift
#'
#' @param x the data set (data.table)
#' @param dep_var the dependent variable name (character)
#' @param time_var the time variable name (character)
#' @param time_interval the delta t (numeric)
#'
#' @return data.table of gaps
#' @export
#'
find_level_shift <- function(x,
                             dep_var = 'val', 
                             time_var = 'datetime', 
                             time_interval = 1) {

  midpoint <- NULL
  
  # remove times with no values
  tms <- x[!is.na(get(dep_var))][[time_var]]
  
  # find locations of gaps
  wh  <- which(diff(as.numeric(tms)) != time_interval)
  n   <- length(tms)
  
  subs <- list()
  
  if (length(wh) == 0) {
    subs <- data.table(start = tms[1], end = tms[n])
  } else {
    for (i in 1:(length(wh))) {
      
      start <- tms[(wh[i])]
      end   <- tms[(wh[i]) + 1]
      
      # make sure it is even 
      if((as.numeric(start)-as.numeric(end)) %% 2 != 0) {
        end <- end + 1
      }
      
      subs[[i]] <- data.table(start     = start, 
                              end       = end,
                              start_val = x[get(time_var) == start][[dep_var]],
                              end_val   = x[get(time_var) == end][[dep_var]])
    }
  }
  
  subs <- rbindlist(subs)

  subs[, midpoint := as.POSIXct(round((as.numeric(start) + as.numeric(end)) / 2), 
                                origin = '1970-01-01', tz = 'UTC')]
  
  return(subs)
  
}





#' gap_fill
#'
#' @param x data.table of water levels
#' @param gaps data.table of gaps
#' @param recipe recipe to apply
#' @param dep_var the time variable name (character)
#' @param time_var the time variable name (character)
#' @param time_interval the delta t (numeric)
#' @param buffer_start how much buffer on each side of gap
#' @param buffer_end how much buffer on each side of gap
#' @param max_interp largest gap to interpolate
#' @param include_level_shift include level shift in adjustment
#'
#' @return data.table of predictions
#' 
#' @export
#'
gap_fill <- function(x, 
                     gaps, 
                     recipe, 
                     dep_var = 'wl',
                     time_var = 'datetime',
                     time_interval = 1L, 
                     buffer_start = 86400 * 4,
                     buffer_end = 86400 * 4,
                     max_interp = 86400 * 7,
                     include_level_shift = TRUE) {
  
  # gaps <- find_level_shift(x, dep_var = dep_var, time_var = time_var,
  #                          time_interval = time_interval)
  
  if (nrow(gaps) == 0) {
    stop('no gaps to fill')
    return(NULL)
  }
  
  gaps <- gaps[as.numeric(end) - as.numeric(start) < max_interp]
  
  if (nrow(gaps) == 0) {
    warning('no gaps to fill')
    return(NULL)
  }
  
  gaps[, start_reg := start - (buffer_start)]
  gaps[, end_reg   := end + (buffer_end)]
  
  x <- copy(x)
  
  gaps <- gaps[, get_fit_summary(x, recipe, start_reg, end_reg, start, end),
       by = list(midpoint = as.POSIXct(midpoint, tz = 'UTC'),
                 start_gap = as.POSIXct(start, tz = 'UTC'), 
                 end_gap = as.POSIXct(end, tz = 'UTC'),
                 start_val = start_val, 
                 end_val = end_val)]
  
  
  if(include_level_shift) {
    
    gaps <- get_intercept_stats(gaps)
  
    gaps[, start_val := start_val - cumsum(c(0, sh[-length(sh)]))]
    gaps[, end_val := end_val - cumsum(sh)]
    
  }
  
  gaps[, to_exclude := nrow(predict_adj[[1]]) == 0, by = midpoint]
  
  gaps[to_exclude == FALSE, `:=` (predict_adj =
                 list(data.table(datetime = predict_adj[[1]][['datetime']],
                                 adj = stretch_interp(start_val[1],
                                                      end_val[1],
                                                      predict_adj[[1]][['predict_adj']])))),
       by = midpoint]


  
}





#' get_fit_summary
#'
#' @param x data.table of water levels
#' @param recipe recipe to apply
#' @param start_reg start subset time
#' @param end_reg end subset time
#' @param start_gap start subset time
#' @param end_gap end subset time
#'
#' @return data.table summary
#' @export
#'
#'
#'
get_fit_summary <- function(x, recipe, start_reg, end_reg, start_gap, end_gap) {
  
  y   <- x[between(datetime, start_reg, end_reg)]
  
  dat <- recipe %>%
    prep(training = y) %>%
    portion()
  
  form <- formula_from_recipe(recipe = recipe)
  fit <- lm(form, 
            dat[!is.na(as.vector(dat$outcome)),],  
            x = FALSE, y = FALSE, tol = 1e-50)
  
  # get predictions without level shift
  dat_sub <- dat[is.na(as.vector(dat$outcome)),]
  pt  <- data.table(level_shift = as.numeric(predict(fit, dat_sub, type = 'terms', terms = 'level_shift')),
                    datetime = as.POSIXct(as.numeric(dat_sub[['datetime']]), origin = '1970-01-01', tz = 'UTC'))
  pt[, predict := predict(fit, dat_sub)]
  pt[, level_shift := level_shift - level_shift[1]]
  pt[, predict_adj := predict - level_shift]
  pt <- pt[between(datetime, start_gap, end_gap)]
  pt[, level_shift := NULL]
  pt[, predict := NULL]
  
  out <- summarize_lm(fit)
  #out[, `:=` (rank = fit$rank)]
  out[, start_reg := start_reg]
  out[, end_reg := end_reg]
  out[, `:=` (recipe = list(recipe))]
  #out[, `:=` (coef = list(summarize_coef(fit)))]
  out[, `:=` (formula = list(form))]
  out[, `:=` (terms = list(terms))]
  out[, `:=` (pivot = list(qr(fit)$pivot))]
  out[, `:=` (predict_adj = list(pt))]
  
  out
  
}




pree <- function(x, fit_dt) {
  
  dat <- recipe %>%
    prep(training = x) %>%
    portion()
  
  terms <- labels(terms(fit_dt$form))
  
  term_names <- lapply(select(dat, terms), colnames)
  
  lapply(term_names, function(x) {
    select
  })
  
  terms <- names(dat)
  
}






#' formula_from_recipe
#' guess formula from recipe
#'
#' @param recipe recipe to apply
#'
#' @return formula
#' @export
#'
formula_from_recipe <- function(recipe) {
  
  terms <- na.omit(
    vapply(recipe$steps, FUN = function(x) {
      as.character(x$role)
    }, FUN.VALUE = 'character'))
  terms <- terms[terms != 'exclude']
  rhs <- paste(terms, collapse = '+')
  
  if('level_shift' %in% terms) {
    add <- '-1'
    rhs <- paste0(rhs, add)
  }
  
  as.formula(paste0('outcome', '~', rhs))
}


#' gap_fill2
#'
#' @param x gap data.table 
#' @param y interpolated values
#'
#' @return data.table with gap values adjusted for range
#' @export
#'
gap_fill2 <- function(x, y) {
  
  x <- x[y, on = 'midpoint']
  
  x[, list(interp = stretch_interp(start_val[1], 
                                   end_val[1], 
                                   predict_adj[between(datetime, start[1], end[1])]),
           datetime = datetime[between(datetime, start[1], end[1])]), 
       by = list(midpoint)]

}


#' get_shift
#'
#' @param x data.table of water levels
#' @param recipe recipe to apply
#' @param start start subset time
#' @param end end subset time
#'
#' @return data.table of predictions
#' @export
#'
get_shift <- function(x, recipe, start, end) {
  
  y <- x[between(datetime, start, end)]

  dat <- recipe %>%
    prep(training = y) %>%
    portion()
  
  fit <- lm(outcome~distributed_lag + lag_earthtide + datetime + level_shift -1, 
            dat,  
            x = FALSE, y = FALSE, tol = 1e-50)
  
  #print(summary(fit))
  
  pt <- as.data.table(predict(fit, dat, type = 'terms', terms = 'level_shift'))
  pt[, predict := predict(fit, dat)]
  pt[, level_shift := level_shift - level_shift[1]]
  pt[, datetime := as.POSIXct(dat$datetime, origin = '1970-01-01', tz = 'UTC')]
  pt[, predict_adj := predict - level_shift]
  pt
  
}


# x is the result of gap_fill
#' get_level_shift_coef
#'
#' @param x data.table of level shifts from regression
#'
#' @return data.table of shifts
#' @export
#'
get_level_shift_coef <- function(x) {
  
  co <- x[, (coefs[[1]]), by = list(start_reg, end_reg, start_gap, end_gap, midpoint)]
  # print(co, 100)
  co <- co[grep('level_shift', name)]
  setnames(co, 'Estimate', 'level_shift')
  
  # there shouldn't be any NA values but remove them if there are
  co <- co[!is.na(level_shift)]
  
  co 
  
}


# x is the result of gap_fill
#' level_shift_to_datetime
#'
#' @param x character string that includes the date
#'
#' @return datetime
#' @export
#'
level_shift_to_datetime <- function(x) {
  
  # Remove non-numeric
  x <- gsub("[^0-9.-]", "", x)
  datetime <- paste(scan(text = x, sep = '.', quiet = TRUE), collapse = ' ')
  
  as.POSIXct(datetime, 
             format = '%Y %m %d %H %M %S', 
             origin = '1970-01-01',
             tz = 'UTC')
  
}


#' get_intercept_stats
#'
#' @param x data.table of level shifts from regression
#'
#' @return data.table of shifts
#' @export
#'
get_intercept_stats <- function(x) {
  
  # get level_shift from regression
  y <- get_level_shift_coef(x)
  
  # the name is for grouping coefficients
  y <- y[, list(shifts = unique(level_shift),
                name), 
          by = list(midpoint)]
  
  
  y[, shift_diff := c(0.0, diff(shifts)), by = midpoint]
  
  # 
  # mids <- unique(x[, list(shift_datetime = midpoint, 
  #                         end_toss = midpoint)])
  # 
  # rngs <- unique(x[, list(start = min_datetime,
  #                         end = max_datetime, 
  #                         midpoint)])
  # 
  # 
  # setkey(mids, shift_datetime, end_toss)
  # setkey(rngs, start, end)
  # grps <- foverlaps(rngs, mids)[, list(start, end, shift_datetime, midpoint, type = 'reg')]
  # 
  # grps <- grps[, rbind(data.table(shift_datetime = start[1], midpoint = midpoint[1], type = 'non-reg'), .SD),
  #              by = list(start, end)]
  # 
  # grps <- grps[, list(shift_datetime, midpoint, type)]
  # 
  # x <- cbind(x, shift_datetime = grps$shift_datetime)
  
  #return(x[midpoint == shift_datetime])
  
  # the first value should always be 0 and can be removed
  
  y <- y[, .SD[-1], midpoint]
  
  small_mag <- function(x) { x[which.min(abs(x))]}
  
  y <- y[, list(min  = min(shift_diff),
                max  = max(shift_diff),
                mean = mean(shift_diff),
                sd   = sd(shift_diff),
                sh   = small_mag(c(range(shift_diff), mean(shift_diff))),
                n = .N,
                midpoint = level_shift_to_datetime(name)),
     by = list(name)]
  
  y <- y[!(min == 0.0 & max == 0)]
  
  y <- x[y, on = 'midpoint']
  
  return(y)
}


#' add_level_shifts
#'
#' @param x datetimes
#' @param y shift vector
#'
#' @return vector of shifts
#' @export
#'
add_level_shifts <- function(x, y) {
  
  out <- rep(0.0, length(x))
  
  for(i in 1:nrow(y)) {
    wh <- which(x > y$midpoint[i])
    mn <- as.numeric(y[i, list(min, max, mean)])
    mn <- mn[which.min(abs(mn))]
    out[wh] <- out[wh] + mn
  }
  
  
  return(out)
  
}


#' stretch_interp
#'
#' @param start_val starting value to match
#' @param end_val end value to match
#' @param values set of values
#'
#' @return shifted vector of values
#' @export
#' 
stretch_interp <- function(start_val = NA, 
                           end_val = NA,
                           values) {
  
  shifta <- start_val - values[1]
  shiftb <- end_val - values[length(values)]
  
  if(is.na(start_val)){
    shifta <- shiftb
  } 
  if(is.na(end_val)){
    shiftb <- shifta
  } 
  
  values <- values + seq(shifta, shiftb, length.out = length(values))
  
  return(values)
}


# library(data.table)
# dat <- data.table(x = rnorm(100), y = rnorm(100), z = rnorm(100))
# dat[30:40, x := NA_real_]
# dat[50:70, z := NA_real_]
# dat[10:12, y := NA_real_]
# 
# fill_lm <- function(dat,
#                     fill_cols, 
#                     partial = FALSE) {
# 
#   dat <- copy(dat)
# 
#   if(length(fill_cols) < 2) {
#     stop('fill_cols must have length 2 or greater')
#   }
# 
#   for(i in seq_along(fill_cols)) {
# 
#     fill_col <- fill_cols[i]
#     ind_cols <- setdiff(fill_cols, fill_col)
# 
#     for(j in seq_along(ind_cols)) {
# 
#       for(k in seq_along(gaps)) {
# 
#         to_fill <- dat[is.na(get(fill_col))]
#         ind_col <- ind_cols[j]
#         n_na    <- sum(is.na(to_fill[[ind_col]]))
# 
#         if(n_na == 0) {
# 
#           form    <- as.formula(paste0(fill_col, '~', ind_col))
#           fit     <- lm(form, dat)
#           dat[is.na(get(fill_col)), (fill_col) := predict(fit, to_fill)]
#           break
#         }
#       }
#     }
#   }
# 
#   return(dat)
# 
# }


# library(data.table)
# library(recipes)
# library(waterlevel)
# 
# dt <- data.table(x = rnorm(100000), y = rnorm(100000), z = rnorm(100000), datetime = 1:100000)
# 
# recipe <- recipe(y+z~x, dt[1]) %>%
#   step_distributed_lag(x, knots = c(0, 2, 5)) %>%
#   step_mutate(test = x^2, role = 'squared')
# 
# tmp <- fit_gaps(dt, recipe = recipe)
# dtnew <- data.table(x = rnorm(1000), y = rnorm(1000), z = rnorm(1000))
# aa <- predict_gaps(dtnew, fits = tmp)


#' find_gaps
#'
#' @param x the data set (data.table)
#' @param dep_var the dependent variable name (character)
#' @param time_var the time variable name (character)
#' @param time_interval the delta t (numeric)
#'
#' @return data.table of gaps
#' @export
#'
find_gaps <- function(x,
                      dep_var = 'val', 
                      time_var = 'datetime', 
                      time_interval = 1) {
  
  midpoint <- NULL
  
  # remove times with no values
  tms <- x[!is.na(get(dep_var))][[time_var]]
  
  # find locations of gaps
  wh  <- which(diff(as.numeric(tms)) != time_interval)
  n   <- length(tms)
  
  subs <- list()
  
  if (length(wh) == 0) {
    subs <- data.table(start = tms[1], end = tms[n])
  } else {
    for (i in 1:(length(wh))) {
      
      start <- tms[(wh[i])]
      end   <- tms[(wh[i]) + 1]
      
      subs[[i]] <- data.table(start     = start, 
                              end       = end,
                              start_val = x[get(time_var) == start][[dep_var]],
                              end_val   = x[get(time_var) == end][[dep_var]])
    }
  }
  
  subs <- rbindlist(subs)
  
  subs[, midpoint := as.POSIXct((as.numeric(start) + as.numeric(end)) / 2, 
                                origin = '1970-01-01', tz = 'UTC')]
  
  return(subs)
  
}


#' fit_gaps
#'
#' @param x the data set (data.table)
#' @param recipe the recipe to apply
#' @param time_var the time variable name (character)
#'
#' @return data.table of fit results
#' @export
#'
fit_gaps <- function(x, recipe, time_var = 'datetime') {
  
  
  form <- formula_from_recipe(recipe)
  
  fit_dat <- recipe %>% 
    prep(training = x) %>% 
    portion()
  
  fit <- lm(formula = form, data = fit_dat,  
            x = FALSE, y = FALSE, tol = 1e-50)
  
  out <- summarize_lm(fit)
  #print(out)
  out[, `:=` (recipe      = list(recipe))]
  out[, `:=` (start_train = min(x$datetime))]
  out[, `:=` (end_train   = max(x$datetime))]
  out
  
}


#' predict_gaps
#'
#' @param x the data set (data.table)
#' @param fits data.table of fits
#' @param term_labels names of the group (character)
#'
#' @return data.table of predictions
#' @export
#'
predict_gaps <- function(x, fits, term_labels = NULL) {
  
  recipe      <- fits[['recipe']][[1]]
  
  if(is.null(term_labels)) {
    term_labels <- fits[['term_labels']][[1]]
  }
  
  coefs       <- fits[['coefs']][[1]]
  setkey(coefs, name)
  fit_dat <- recipe %>% 
    prep(training = x) %>% 
    portion()
  
  out <- matrix(NA_real_, 
                nrow = nrow(fit_dat), 
                ncol = length(term_labels))
  
  for (i in seq_along(term_labels)) {
    
    term_label <- term_labels[i]
    nms <- paste0(term_label, colnames(fit_dat[[term_label]]))
    
    out[, i] <- fit_dat[[term_label]] %*% as.matrix(coefs[nms, list(Estimate)])
  
  }

  out <- as.data.table(out)
  setnames(out, term_labels)
  
  out
  
}
jkennel/waterlevel documentation built on Dec. 1, 2019, 6:24 p.m.