R/analysis.R

Defines functions moving_avg summarize_lagged_cors roll_nls max_unc roll_corr get_outliers

## =============================================================================
#' Title
#'
#' @param x
#' @param n
#'
#' @return
#' @export
#'
#' @examples
moving_avg <- function(x, n = 5) {
  out <- stats::filter(x, rep(1 / n, n), sides = 2)
  as.numeric(out)
}

## =============================================================================
#' Title
#'
#' @param x 
#' @param y 
#' @param lag_max 
#' @param lag_seq 
#' @param method 
#' @param int_fun 
#' @param ... 
#'
#' @return
#' @export
#'
#' @examples
summarize_lagged_cors <- function(x, y, lag_max, lag_seq = 1, 
                                  method = c("discrete", "integrated"), 
                                  int_fun = mean, ...) {
  method <- match.arg(method)
  
  # Set vector of lags to be tested
  lags <- seq(0, lag_max, by = lag_seq)
  
  # Initialize output dataset using instantaneous correlation
  out <- broom::tidy(cor.test(x, y))
  
  # Discrete approach: y is correlated to x at the nth previous index
  if (method == "discrete") {
    for (i in 2:length(lags)) {
      x_lag <- dplyr::lag(x, lags[i])
      test <- cor.test(x_lag, y, ...) %>% broom::tidy()
      out <- dplyr::bind_rows(out, test)
    }
  } 
  
  # Integrated appraoch: y is correlated to a summary of x from nth previous 
  # index to present
  if (method == "integrated") {
    for (i in seq_along(lags)) {
      x_lag <- zoo::rollapplyr(x, width = lags[i], FUN = int_fun, fill = NA)
      test <- cor.test(x_lag, y, ...) %>% broom::tidy()
      out <- dplyr::bind_rows(out, test)
    }
  }
  
  # Add lag index to data frame, return
  out %>%
    dplyr::mutate(lag = lags) %>%
    dplyr::select(lag, dplyr::everything())
}

# THIS NEEDS HELP!!!!!
roll_nls <- function(data, a_frs, a_veg, a_wtr, norm = 0, frame = 1344,
                     screen = FALSE, method = c("gpp", "rref", "e0"),
                     rolltype = c("continuous", "window")) {
  method <- match.arg(method)
  rolltype <- match.arg(rolltype)
  is.error <- function(x) inherits(x, "try-error")
  # Map data onto a full year time vector
  timesteps <- data.frame(
    timestamp = create_timesteps(2018, shift_by = 30, tz = "Etc/GMT+5")
  )
  if (method == "gpp") {
    data <- data %>% dplyr::mutate(flux = GPP, env = PPFD)
  } else if (method %in% c("rref", "e0")) {
    data <- data %>% dplyr::mutate(flux = Reco, env = Tair)
  }
  df <- data %>%
    dplyr::left_join(timesteps, ., by = "timestamp") %>%
    dplyr::mutate(
      gnorm = seq(from = -norm, to = norm, length.out = nrow(timesteps)),
      a1 = exp(-gnorm^2) * a_frs,
      a2 = exp(-gnorm^2) * a_veg,
      a3 = exp(-gnorm^2) * a_wtr
    ) %>%
    dplyr::select(timestamp, flux, env, frs, veg, wtr, a1, a2, a3)
  
  side <- frame / 2
  start <- side + 1
  n <- nrow(df)
  end <- n - start
  seq <- seq(start, end, by = 1)
  if (rolltype == "window") {
    # Reichstein et al. (2005) method
    if (method == "e0") {
      frame <- 720
      side <- frame / 2
      start <- side + 1
      end <- nrow(timesteps) - start
      seq <- seq(start, nrow(timesteps), by = 240)
      n <- length(seq)
      end <- seq[n]
    } else if (method == "rref") {
      frame <- 192
      side <- frame / 2
      start <- side + 1
      end <- nrow(timesteps) - start
      seq <- seq(start, nrow(timesteps), by = frame)
      n <- length(seq)
      end <- seq[n]
    }
    
  }
  out <- data.frame(
    timestamp = rep(NA, n),
    g_frs = rep(NA, n), g_veg = rep(NA, n), g_wtr = rep(NA, n),
    se_frs = rep(NA, n), se_veg = rep(NA, n), se_wtr = rep(NA, n),
    p_frs = rep(NA, n), p_veg = rep(NA, n), p_wtr = rep(NA, n),
    a_frs = rep(NA, n), a_veg = rep(NA, n), a_wtr = rep(NA, n),
    fitted = rep(NA, n), resid = rep(NA, n),
    rsq = rep(NA, n), rmse = rep(NA, n), mae =  rep(NA, n)
  )
  if (method == "rref") {
    out <- out %>%
      dplyr::rename(
        r_frs = g_frs, r_veg = g_veg, r_wtr = g_wtr,
        e_frs = a_frs, e_veg = a_veg, e_wtr = a_wtr
      )
  } else if (method == "e0") {
    out <- out %>%
      dplyr::rename(
        e_frs = g_frs, e_veg = g_veg, e_wtr = g_wtr,
        r_frs = a_frs, r_veg = a_veg, r_wtr = a_wtr
      )
  }
  pbar <- if (rolltype == "continuous") {
    dplyr::progress_estimated(n - frame)
  } else dplyr::progress_estimated(n)
  for (i in seq_along(seq)) {
    temp <- df[(seq[i] - side):(seq[i] + side), ]
    # Pre-screening conditions
    if (all(is.na(temp$flux)) | all(is.na(temp$veg)) | all(is.na(temp$env))) {
      pbar$tick()$print()
      next
    } else if (method == "e0") {
      range <- range(temp$env, na.rm = TRUE)
      complete <- complete.cases(temp)
      #if (range[2] - range[1] <= 5 | sum(complete, na.rm = TRUE) <= 6) {
      #  pbar$tick()$print()
      #  next
      #}
    }
    # Run nonlinear regression
    if (method == "gpp") {
      mod <- try(nls(
        flux ~
          frs * ((g1 * -a_frs * env) / (a_frs * env + g1)) +
          veg * ((g2 * -a_veg * env) / (a_veg * env + g2)) +
          wtr * ((g3 * -a_wtr * env) / (a_wtr * env + g3)),
        start = list(g1 = 1, g2 = 1, g3 = 1), data = temp,
        control = nls.control(maxiter = 200)
      ), silent = TRUE)
    } else if (method == "rref") {
      mod <- try(nls(
        flux ~
          frs * r1 * exp(a_frs * (1 / (283.15 - 227.13) - 1 / (env - 227.13))) +
          veg * r2 * exp(a_veg * (1 / (283.15 - 227.13) - 1 / (env - 227.13))) +
          wtr * r3 * exp(a_wtr * (1 / (283.15 - 227.13) - 1 / (env - 227.13))),
        start = list(r1 = 1, r2 = 1, r3 = 1), data = temp,
        control = nls.control(maxiter = 200)
      ), silent = TRUE)
    } else if (method == "e0") {
      mod <- try(nls(
        flux ~
          frs * a_frs * exp(e1 * (1 / (283.15 - 227.13) - 1 / (env - 227.13))) +
          veg * a_veg * exp(e2 * (1 / (283.15 - 227.13) - 1 / (env - 227.13))) +
          wtr * a_wtr * exp(e3 * (1 / (283.15 - 227.13) - 1 / (env - 227.13))),
        start = list(e1 = 100, e2 = 100, e3 = 100), data = temp,
        control = nls.control(maxiter = 200)
      ), silent = TRUE)
    }
    # Gather model parameters if model is created without error
    if (is.error(mod)) {
      pbar$tick()$print()
      next
    } else {
      coefs <- summary(mod)$coefficients
      fitted <- fitted(mod)[start]
      resid <- resid(mod)[start]
      out[i, ] <- c(
        NA,
        coefs[1, 1], coefs[2, 1], coefs[3, 1],
        coefs[1, 2], coefs[2, 2], coefs[3, 2],
        coefs[1, 4], coefs[2, 4], coefs[3, 4],
        df$a1[i], df$a2[i], df$a3[i],
        fitted, resid, modelr::rsquare(mod, temp), modelr::rmse(mod, temp),
        modelr::mae(mod, temp)
      )
      if (method == "rref" & rolltype == "window") {
        out[i, "timestamp"] <- median(temp$timestamp[which(!is.na(temp$flux))])
      } else {
        out[i, "timestamp"] <- df$timestamp[seq[i]]
      }
      
    }
    pbar$tick()$print()
  }
  out$timestamp <- lubridate::as_datetime(out$timestamp, tz = "Etc/GMT+5")
  if (screen) {
    out[, 4] <- dplyr::if_else(
      out[, 4] > 0 & out[, 5] > 0 & out[, 6] > 0, out[, 4], NA_real_
    )
    out[, 5] <- dplyr::if_else(
      out[, 4] > 0 & out[, 5] > 0 & out[, 6] > 0, out[, 5], NA_real_
    )
    out[, 6] <- dplyr::if_else(
      out[, 4] > 0 & out[, 5] > 0 & out[, 6] > 0, out[, 6], NA_real_
    )
  }
  out
}

## =============================================================================
#' Calculate Maximum Uncertainty in a Parameterized Equation
#'
#' @param formula
#' @param vals
#' @param uncs
#' @param unc_type
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
max_unc <- function(formula, vals, uncs, unc_type = c("corr", "uncorr"), ...) {
  unc_type <- match.arg(unc_type)
  vars <- all.vars(formula)
  if (any(is.character(vals))) {
    df <- as.data.frame(rlang::list2(...))
    colnames(df) <- suppressWarnings(vals[which(is.na(as.numeric(vals)))])
    constants <- vars[!vars %in% names(df)]
    for (i in seq_along(constants)) {
      val <- as.numeric(vals[which(vars == constants[i])])
      df[, constants[i]] <- rep(val, nrow(df))
    }
    df <- df[vars]
  } else {
    df <- data.frame()
    for (i in 1:length(vals)) {
      df[1, i] <- vals[i]
    }
    colnames(df) <- vars
  }
  partials <- deriv(formula, vars)
  evald <- as.data.frame(attr(eval(partials, df), "gradient"))
  for (i in 1:length(evald)) {
    evald[, i] <- abs(evald[, i]) * uncs[which(vars == names(evald)[i])]
    if (unc_type == "uncorr") evald[, i] <- evald[, i]^2
  }
  if (unc_type == "corr") {
    rowSums(evald)
  } else if (unc_type == "uncorr") {
    sqrt(rowSums(evald))
  }
}

## =============================================================================
#' Title
#'
#' @param x
#' @param y
#' @param n
#' @param zmax
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
roll_corr <- function(x, y, n, zmax = 0.25 * n, ...) {
  df <- data.frame(x, y)
  df$z <- zoo::rollapply(
    data[, c("tair", "fch4")], n,
    function(x) length(x[!is.na(x[, 1]) & !is.na(x[, 2])]) / 2,
    by.column = FALSE, fill = NA
  )
  df$x <- dplyr::if_else(df$z < zmax, NA_real_, df$x)
  df$y <- dplyr::if_else(df$z < zmax, NA_real_, df$y)
  df$z <- NULL
  out <- zoo::rollapply(
    df, n, function(x) cor(x[, 1], x[, 2], use = "pairwise.complete.obs"),
    by.column = FALSE, fill = NA
  )
  out <- dplyr::if_else(out == 1, NA_real_, out)
  out[which(!is.na(out))][1:4] <- NA
  out
}

## =============================================================================
#' Title
#'
#' @param x
#' @param y
#' @param z
#' @param method
#'
#' @return
#' @export
#'
#' @examples
get_outliers <- function(x, y = NULL, z,
                         method = c("default", "mad", "doublemad", "iqr",
                                    "spikes", "cooks", "mahalanobis", "aq",
                                    "pcout")) {
  method <- match.arg(method)
  if (is.null(y)) {
    if (method %in% c("default", "mad")) {
      # default mad z = 7
      if (missing(z)) z <- 7
      min <- median(x, na.rm = TRUE) - mad(x, na.rm = TRUE) * z
      max <- median(x, na.rm = TRUE) + mad(x, na.rm = TRUE) * z
      !dplyr::between(x, min, max)
    } else if (method == "doublemad") {
      # default mad z = 7
      if (missing(z)) z <- 7
      abs_dev <- abs(x[!is.na(x)] - median(x[!is.na(x)]))
      left <- median(abs_dev[x[!is.na(x)] <= median(x[!is.na(x)])])
      right <- median(abs_dev[x[!is.na(x)] >= median(x[!is.na(x)])])
      x_mad <- dplyr::if_else(x < median(x, na.rm = TRUE), left, right)
      mad_dist <- abs(x - median(x, na.rm = TRUE)) / x_mad
      mad_dist > z
    } else if (method == "iqr") {
      # AKA Tukey's rule: default iqr z = 1.5
      if (missing(z)) z <- 1.5
      quant <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
      iqr <- z * IQR(x, na.rm = TRUE)
      min <- quant[1] - iqr
      max <- quant[2] + iqr
      !dplyr::between(x, min, max)
    } else if (method == "spikes") {
      # x is flagged if it spikes beyond what is expected given variability for
      # time period
      if (missing(z)) z <- 7
      df <- data.frame(x = x)
      n <- 624
      #browser()
      df <- df %>%
        # calculate immediate differences, cut out 2-week windows
        dplyr::mutate(
          di = (x - dplyr::lag(x, 1)) - (dplyr::lead(x, 1) - x),
          window = ggplot2::cut_interval(dplyr::row_number(), length = n),
          window = forcats::fct_explicit_na(window)
        ) %>%
        # separate daytime and nighttime fluxes
        dplyr::group_by(window) %>%
        # calculate mad and median of the differences
        dplyr::mutate(
          med = median(di, na.rm = TRUE),
          mad = median(abs(di - med), na.rm = TRUE) * z / 0.6745,
          #mad = mad(di, na.rm = TRUE) * z / 0.6745,
          min = med - mad,
          max = med + mad
        )
      #!dplyr::between(df$di, df$min, df$max)
      df$di < df$min | df$di > df$max & !is.na(df$di)
    }
  } else {
    if (method %in% c("default", "cooks")) {
      # default cooks z = 4
      if (missing(z)) z <- 4
      cooksd <- cooks.distance(lm(x ~ y, na.action = na.exclude))
      lim <- mean(cooksd, na.rm = TRUE) * z
      cooksd > lim
    } else if (method == "mahalanobis") {
      # default mahalanobis z = 12
      if (missing(z)) z <- 12
      m_df <- dplyr::bind_cols(list(x, y))
      m_dist <- mahalanobis(
        m_df,
        colMeans(m_df, na.rm = TRUE),
        cov(m_df, use = "pairwise.complete.obs")
      )
      m_dist > z
    } else if (method == "aq") {
      # default z = 0.05
      # mvoutlier::aq()
    } else if (method == "pcout") {
      # mvoutlier::pcout() - $wfinal01: 0 = outlier, $wfinal: smaller values =
      # more likely to be outliers
    }
  }
}
grahamstewart12/fluxtools documentation built on Dec. 29, 2019, 10:53 a.m.