fingerprinting: Detection and Attribution Analysis of Climate Change

Documented in fingerprint

##' Optimal Fingerprinting via total least square regression.
##'
##' This function estimates the signal factors and corresponding confidence 
##' interval via the estimating equation or total least squares.
##'
##' @param Xtilde \eqn{n \times p} matrix, signal pattern to be detected.
##' @param Y \eqn{n \times 1} matrix, length \eqn{S \times T}, observed climate variable.
##' @param mruns number of ensembles to estimate the corresponding pattern. 
##' It is used as the scale of the covariance matrix for \eqn{X_i}.
##' @param ctlruns.sigma \eqn{m \times n} matrix, a group of \eqn{m} independent control 
##' runs for estimating covariance matrix, which is used in point estimation of 
##' the signal factors.
##' @param ctlruns.bhvar \eqn{m \times n} matrix, another group of \eqn{m} independent control 
##' runs for estimating the corresponding confidence interval of the signal factors, 
##' in EE or PBC approach should be same as ctlruns.sigma.
##' @param S number of locations for the observed responses.
##' @param T number of time steps for the observed responses.
##' @param B number of replicates in bootstrap procedure, mainly for the PBC and TS methods, can be 
##' specified in "EE" method but not necessary. By default B = 0 as the default method is "EE".
##' @param Proj The projection matrix for computing for scaling factors of other external forcings 
##' with the current input when using EE. For example, when ALL and NAT are used for modeling, 
##' specifying the Proj matrix to return the results for ANT and NAT.
##' @param method for estimating the scaling factors and corresponding confidence interval
##' @param cov.method method for estimation of covariance matrix in confidence 
##' interval estimation of PBC method. (only for PBC method).
##' @param conf.level confidence level for confidence interval estimation.
##' @param missing indicator for whether missing values present in Y.
##' @param cal.a indicator for calculating the a value, otherwise use default value a = 1. (only for EE method)
##' @param ridge shrinkage value for adjusting the method for missing observations if missing = TRUE. (only for EE method)
##' @return a list of the fitted model including point estimate and
##' interval estimate of coefficients and corresponding estimate of standard error.
##' @author Yan Li
##' @keywords regression tls fingerprinting estimating equation
##' @references \itemize{ 
##' \item  Gleser (1981), Estimation in a Multivariate "Errors in Variables" 
##' Regression Model: Large Sample Results, \emph{Ann. Stat.} 9(1) 24--44.
##' \item Golub and Laon (1980), An Analysis of the Total Least Squares Problem,
##' \emph{SIAM J. Numer. Anal}. 17(6) 883--893.
##' \item Pesta (2012), Total least squares and bootstrapping with 
##' applications in calibration, \emph{Statistics} 47(5), 966--991.
##' \item Li et al (2021), Uncertainty in Optimal Fingerprinting is Underestimated, 
##'       \emph{Environ. Res. Lett.} 16(8) 084043.
##' \item Sai et al (2023), Optimal Fingerprinting with Estimating Equations,
##'       \emph{Journal of Climate} 36(20), 7109–-7122.
##' \item Li et al (2024), Detection and Attribution Analysis of Temperature Changes with Estimating Equations,
##'       \emph{Submitted to Journal of Climate}.
##' }
##' @examples
##' ## load the example dataset
##' data(simDat)
##' Cov <- simDat$Cov[[1]]
##' ANT <- simDat$X[, 1]
##' NAT <- simDat$X[, 2]
##' 
##' ## generate the simulated data set
##' ## generate regression observation
##' Y <- MASS::mvrnorm(n = 1, mu = ANT + NAT, Sigma = Cov)
##' ## generate the forcing responses
##' mruns <- c(1, 1)
##' Xtilde <- cbind(MASS::mvrnorm(n = 1, mu = ANT, Sigma = Cov / mruns[1]),
##'                MASS::mvrnorm(n = 1, mu = NAT, Sigma = Cov / mruns[2]))
##' ## control runs
##' ctlruns <- MASS::mvrnorm(100, mu = rep(0, nrow(Cov)), Sigma = Cov)
##' ## ctlruns.sigma for the point estimation and ctlruns.bhvar for the interval estimation
##' ctlruns.sigma <- ctlruns.bhvar <- ctlruns
##' ## number of locations
##' S <- 25
##' ## number of year steps
##' T <- 10
##' 
##' ## call the function to estimate the signal factors via EE
##' fingerprint(Xtilde, Y, mruns,
##'           ctlruns.sigma, ctlruns.bhvar,
##'           S, T,
##'           ## B = 0, by default
##'           method = "EE",
##'           conf.level = 0.9,
##'           cal.a = TRUE,
##'           missing = FALSE, ridge = 0)
##' @import magrittr
##' @importFrom MASS mvrnorm
##' @importFrom stats cov qnorm quantile sd var pnorm
##' @importFrom utils tail
##' @importFrom pracma ceil
##' @importFrom janitor remove_empty
##' @importFrom grDevices boxplot.stats
##' @export fingerprint

fingerprint <- function(Xtilde, Y, mruns, 
                        ctlruns.sigma, ctlruns.bhvar,
                        S, T, B = 0,
                        Proj = diag(ncol(Xtilde)),
                        method = c("EE", "PBC", "TS", "TS.TempSt"),
                        cov.method = c("l2", "mv"),
                        conf.level = 0.90,
                        missing = FALSE,
                        cal.a = TRUE, ridge = 0) {
  Call <- match.call()
  Xtilde <- as.matrix(Xtilde)
  method <- match.arg(method)  ## method for the estimating procedure
  cov.method <- match.arg(cov.method)  ## method for the covariance matrix extimation in PBC and TS
  if(! method %in% c("EE", "PBC", "TS", "TS.TempSt")) {
    stop("Unknow method for signal estimation")
  }
  if(is.null(colnames(Xtilde))) {
    colnames(Xtilde) <- paste0("forcings ", 1:ncol(Xtilde))
  }
  ## for the EE method
  if(method == "EE") {
    ## check the missing value
    if(any(is.na(Y))) {
      if(!missing) {
        message("missing values in observations")
      }
      missing <- TRUE
    } else {
      if(missing) {
        message("no missing values in observations")
      }
      missing <- FALSE
    }
    output <- fingerprintCEE(
      Xtilde = Xtilde, Y = Y, mruns = mruns,
      ctlruns.1 = ctlruns.sigma, 
      ctlruns.2 = ctlruns.bhvar,
      Proj = Proj,
      nS = S, nT = T, nB = B,
      conf.level = conf.level,
      cal.a = cal.a, missing = missing,
      ridge = ridge
    )
  } else {
    if (cov.method == "l2") {
      cov <- Covest(ctlruns.sigma, method = "l2")$output
    } else if (cov.method == "mv") {
      cov <- Covest(ctlruns.sigma, method = "mv", bandwidth = 0.35)$output
    } else {
      stop("Unknow method for covariance matrix estimation")
    }
    
    if(missing) {
      ## check the missing value
      nomis <- which(!is.na(Y))
    } else {
      nomis <- 1:length(Y)
    }
    ## for the PBC method
    if(method == "PBC") {
      output <- fingerprintTLS(
        X = Xtilde[nomis, , drop = FALSE], 
        Y = Y[nomis, , drop = FALSE],
        cov = cov[nomis, nomis, drop = FALSE], nruns.X = mruns,
        ctlruns = ctlruns.sigma[, nomis, drop = FALSE],
        precision = FALSE,
        conf.level = conf.level,
        conf.method = "PBC",  ## only output results from PBC (TSB also available)
        cov.method = cov.method,
        B = B)
    }
    ## for the two sample approach
    if(method == "TS") {
      output <- fingerprintTS(X = Xtilde[nomis, , drop = FALSE],
                              Y = Y[nomis, , drop = FALSE],
                              nruns.X = mruns, cov = cov[nomis, nomis, drop = FALSE],
                              Z.2 = ctlruns.bhvar[, nomis, drop = FALSE],
                              precision = FALSE,
                              conf.level = conf.level,
                              B = 1000)
    }
    if(method == "TS.TempSt") {
      # cov <- Covest(ctlruns.sigma, method = "l2")$output
      ## check number of time steps, if C > 1, use the pooled data to estimate
      ## the covariance matrix as a block diagonal matrix (a spatial-temporal covariance 
      ## matrix with temporal independence).
      if(T == 1){
        cov <- lwRegcov(ctlruns.sigma)
      } else {
        if(S == 1) {
          cov <- diag(T) %x% cov(t(poolDat(X = t(ctlruns.sigma), C = T, ni = S)))
        } else {
          cov <- diag(T) %x% lwRegcov(t(poolDat(X = t(ctlruns.sigma), C = T, ni = S)))
        }
      }
      output <- fingerprintTS(X = Xtilde[nomis, , drop = FALSE],
                              Y = Y[nomis, , drop = FALSE],
                              nruns.X = mruns, cov = cov[nomis, nomis, drop = FALSE],
                              Z.2 = ctlruns.bhvar[, nomis, drop = FALSE],
                              precision = FALSE,
                              conf.level = conf.level,
                              B = 1000)
    }
    
        ## handling the projection matrix
    if(any(Proj != diag(Xtilde))) {
      ## estimation
      beta.Proj <- Proj %*% as.vector(output$beta)
      var.Proj <- Proj %*% output$var %*% t(Proj)
      sd.Proj <- sqrt(diag(output$var))
      ## confidence interval
      norm.crt <- qnorm(1 - (1 - conf.level)/2)  ## critical value for normal approximation
      ci.Proj <- cbind(beta.Proj - norm.crt * sd.Proj,
                       beta.Proj + norm.crt * sd.Proj)
      ## summarize the results
      output$beta.Proj <- beta.Proj
      output$var.Proj <- var.Proj
      output$ci.Proj <- ci.Proj
      ## rename the results
      if(is.null(rownames(Proj))) {
        Xlab <- paste0("forcings ", 1:ncol(Xtilde))
      } else {
        Xlab <- rownames(Proj)
      }
      names(output$beta.Proj) <- Xlab
      rownames(output$var.Proj) <- colnames(output$var.Proj) <- Xlab
      rownames(output$ci.Proj) <- Xlab
      colnames(output$ci.Proj) <- c("Lower bound", "Upper bound")
    }
    # 
    # ## combine the results
    # result <- list(beta = beta.hat,
    #                var = sd.estim,
    #                ci = ci.estim)
    # 
    # result <- list(beta = beta.hat,
    #                ci = ci.estim, 
    #                var = sd.estim, 
    #                pbc.ratio = ratio)

  }
  ## return the output
  return(output)
}