segCCR: Segmented correspondence curve regression

Documented in segCCR

#' @title Fit the segmented correspondence curve regression
#'
#' @rdname segCCR
#'
#'
#'
#' @param data
#' an optional data frame, list or environment
#' (or object coercible by as.data.frame to a data frame)
#' containing the variables in the model. If not found in data,
#' the variables are taken from environment(formula),
#' typically the environment from which segCCR is called.
#' @param par.ini
#' the initial values for the estimate parameters. The first component is
#' the change point. If is.null(par.ini) == TRUE, par.ini is set in the
#' the details.
#' @param tm
#' The vector of tm.
#' @param NB
#' The bootstrap times to obtain the estimated standard errors.
#' @param sig.level
#' The significant level. Default is 0.05.
#'
#' @description Fit the correspondence curve regression
#'
#' @details Please refer to \strong{Zhang, F. and Li, Q. (2022)}.
#'
#' @references Zhang, F. and Li, Q. (2022).
#' Segmented correspondence curve regression for quantifying
#' covariate effects on the reproducibility of high-throughput experiments.
#'
#'
#' @return A list with the elements:
#'
#' \describe{
#'   \item{coefficients}{a named vector of coefficients.}
#'   \item{std.error}{the estimated standard errors.}
#'   \item{CI}{the confidence intervals.}
#'   \item{pv}{the p-values for individual test.}
#'   \item{jointpv}{the p-values for joint test.}
#'   }
#'
#' @author Feipeng Zhang and Qunhua Li
#'
#' @keywords segCCR

#' @import stats
#' @importFrom boot boot
#' @export
#'
#' @examples
#'## The example of ChIP-seq data
#'\dontrun{
#' data(ChIPseq)
#' ## estimate
#' m = 100
#' tm <- seq(0.01, 0.999, length.out = m)
#' nx = nlevels(factor(ChIPseq$x))
#' par.ini = c(0.5, 2, 1, rep(0.1, 2*(nx-1)))  # initial value
#' fit = segCCR(data = ChIPseq,
#'            par.ini = par.ini,
#'            tm=tm,
#'            NB = 5)
#' }





segCCR <- function(data, par.ini, tm, NB=100, sig.level = 0.05){
  y1 = data$y1
  y2 = data$y2
  x = data$x

  y = data.frame(y1, y2)
  z = segCCR_fit(y, x, par.ini, tm, NB, sig.level)

  z

}

segCCR_fit <- function(y, x, par.ini, tm, NB, sig.level)
{
  ff = cbind(y1,y2)~x
  utils::str(m <- model.frame(ff, ChIPseq))
  x <- model.matrix(ff, m)

  if (is.null(n <- nrow(x))) stop("'x' must be a matrix")
  if(n == 0L) stop("0 (non-NA) cases")
  p <- ncol(x)
  if (p == 0L) {
    ## ops, null model
    return(list(coefficients = numeric() ))
  }


  ny <- NCOL(y)
  ## treat one-col matrix as vector
  if(is.matrix(y) && ny != 2)
    stop("y need have 2 columns")
  if (NROW(y) != n)
    stop("incompatible dimensions")
  # chkDots(...)

  m <- length(tm)

  y1 <- y[,1]
  y2 <- y[,2]
  dat <- cbind(y1, y2, x)

  xf <- factor(x[,-1])
  px <- nlevels(xf)
  x.unique <- levels(xf)


  ## some preliminary functions
  gfun <- function(x){log(x+1e-10)}
  gfun.inv <- function(x){exp(x)}
  gfun.inv.dev1 <- function(x){exp(x)}
  gfun.inv.dev2 <- function(x){exp(x)}

  hfun <- function(x){log(x+1e-10)}
  hfun.dev1 <- function(x) {1/(x+1e-10)}
  hfun.dev2 <- function(x) {-1/(x+1e-10)^2}

  ### calculation Uim and correpondence curve
  prepare.Uim <- function(a, tm){ ### Uim

    n <- nrow(a)
    m <- length(tm)

    # a: (y_i1, y_i2) scores
    a.cdf1 <- ecdf(a[,1])
    a.cdf2 <- ecdf(a[,2])
    cdf1.value <- a.cdf1(a[,1])*n/(n+1)
    cdf2.value <- a.cdf2(a[,2])*n/(n+1)


    # get counts in each category
    wt.temp <- 1*(1*outer(cdf1.value, tm, "<=") + 1*outer(cdf2.value, tm, "<=")==2)
    wt <- cbind(wt.temp[, 1], wt.temp[, -1] - wt.temp[, -m])     # n*m

    invisible(wt)
  }

  ################################################

  # initial values
  if(is.null(par.ini)){
    par.ini = c(0.5, 2, 1, rep(0.1, 2*(px-1)))
  }
  ## initial values for regression coefficients
  bets.ini = par.ini[-1]

  ## initial for the change point
  pini = par.ini[1]
  if(length(tm)<=30){
    ptlow <- max(pini - 0.2, 1e-2)
    ptup <- min(pini + 0.2, 1-1e-2)
  }else{
    ptlow <- max(pini - 0.1, 1e-2)
    ptup <- min(pini + 0.1, 1-1e-2)
  }

  taus <- tm[ptlow <= tm & tm <= ptup]   # grid search range for change point


  ##### fit segCCR
  segCCR.fit <- function(dat, tm, taus, par.ini){

    y1 <- dat[,1]
    y2 <- dat[,2]
    x <- dat[ , -c(1,2)]

    xf <- factor(x[,-1])
    px <- nlevels(xf)

    #### likelihood function
    likFun <- function(bets, tau){

      if(px==0){

        wt <- prepare.Uim(cbind(y1, y2), tm)
        ht <- hfun(tm)
        htau <- hfun(tau)
        zz1 <- (ht-htau) * ifelse(tm <= tau, 1, 0)
        zz2 <- htau + (ht-htau) * ifelse(tm > tau, 1, 0)
        zz = cbind(zz1, zz2)
        coef <-  drop(zz %*% bets)

        ght <- gfun.inv(coef)
        dght <- c(ght[1], diff(ght))
        lpr <- ifelse(dght > 0, log(dght), Inf)
        lik <- sum(wt %*% lpr)

      }else{
        (modmat <- model.matrix(~ xf))
        lables <- apply(modmat, 1, paste, collapse ="|")
        lables_uqi <- unique(lables)

        lik <- 0
        for (i in 1:length(lables_uqi)){
          is.in <- which(lables==lables_uqi[i])
          y1.x <- y1[is.in]
          y2.x <- y2[is.in]

          x.i <- unlist(strsplit(lables_uqi[i],split = "|"))
          x.i <- as.numeric(x.i[which(!(x.i=="|"))])

          wt <- prepare.Uim(cbind(y1.x, y2.x), tm)

          ht <- hfun(tm)
          htau <- hfun(tau)
          zz1 <- (ht-htau) * ifelse(tm <= tau, 1, 0)
          zz2 <- htau + (ht-htau) * ifelse(tm > tau, 1, 0)
          zz <- rbind(zz1, zz2)

          coef <- drop(x.i%*% matrix(bets, ncol=2, byrow = TRUE) %*% zz)
          ght <- gfun.inv(coef)
          dght <- c(ght[1], diff(ght))

          lpr <- ifelse(dght > 0, log(dght), Inf)
          lik <- lik + sum(wt %*% lpr)
        }
      }



      return(lik)
    }

    lik <- NULL
    n.tau <- length(taus)
    betcoef <- matrix(0, nrow = n.tau, ncol = 2*px)
    for (kk in 1:n.tau){

      fit <- try(
        optim(
          par = bets.ini,
          fn = function(bets){-likFun(bets, taus[kk])},
          method = "Nelder-Mead",
          hessian = FALSE),
        silent = TRUE)

      if(isTRUE(class(fit)=="try-error")) { NA }
      else {
        # optimize minimization
        betcoef[kk, ] <- fit$par
        lik[kk] <- (-1)*fit$value
      }

    }


    ## find the change-point
    bp <- min(taus[lik == max(lik, na.rm = TRUE)] )

    ## estimate the regression coefficients
    bethat <- betcoef[taus == bp, ]

    thethat <- c(bp, bethat)

    return(list(thethat=thethat))
  }

  ## estimated coefficients
  fit =  segCCR.fit(dat, tm, taus, par.ini)
  thethat = fit$thethat

  ## bootstrap variance estimate
  segccr_cov <- function(dat, tm, taus, par.ini, NB){

    ## by bootstrap with parallel computation
    segccr_boot <- function(dat, boot.indx){

      # bootstrap the data
      #boot.indx <- sample(nrow(dat), replace = T)
      dat <- dat[boot.indx, ]

      fit <- segCCR.fit(dat, tm, taus, par.ini)

      thethat <- fit$thethat

      invisible(return(thethat))

    }

    # thet.boot <- boot(dat, segccr_boot, R = NB, parallel="multicore",
    #                   ncpus = getOption("boot.ncpus", n.cls))
    thet.boot <- boot(dat, segccr_boot, R = NB)

    thet.cov <- cov(thet.boot$t)
    #thet.ese <- apply(thet.boot$t, 2, sd, na.rm = TRUE)

    invisible(return(thet.cov))
  }


  ## save the bootstrap variance estimate, which is time consuming
  thetcov = segccr_cov(dat, tm, taus, par.ini, NB=3)
  thetse = sqrt(diag(thetcov))

  ## H0: beta_sj=0
  thetpv <- 2*(1-pnorm(abs(thethat/thetse), 0, 1))


  ## joint test: H0: beta_s1=beta_s2=0
  est.beta = thethat[-1]
  cov.beta = thetcov[-1, -1]
  jtest.pvs = c()
  for (k in 1:px){
    Rmat.k = cbind(matrix(0, nrow = 2, ncol = 2*(k-1)), diag(2), matrix(0, nrow = 2, ncol = 2*(px-k) ) )
    Wn.k = drop(t(Rmat.k%*%est.beta) %*% solve(Rmat.k%*%cov.beta%*%t(Rmat.k)) %*% (Rmat.k%*%est.beta))
    # jtest.pvs[k] = 2*min(pchisq(q=Wn.k, df=2, lower.tail=FALSE), pchisq(q=Wn.k, df=2, lower.tail=TRUE))
    jtest.pvs[k] = pchisq(q=Wn.k, df=2, lower.tail=FALSE)
  }


  ## names
  est <- thethat
  ese <- thetse

  if(px==0){
    dn1 <- c(expression(tau),
             expression(beta["01"]),
             expression(beta["02"])
    )
  }else{

    dn0 <- levels(xf)[-1]
    dn1 <- c(expression(tau),
             expression(beta["01"]),
             expression(beta["02"]),
             paste0(rep(dn0, each=2),
                    rep(c("1", "2"), times=length(dn0)), seq = ""))
  }

  # plot(1,1, main=expression('title'^2))  #superscript
  # plot(1,1, main=expression('title'[2])) #subscript



  names(est) <- dn1


  CI.low <- est - qnorm(1-sig.level/2, 0, 1) * ese
  CI.up <- est + qnorm(1-sig.level/2, 0, 1) * ese

  z = list()
  z$coefficients <- est
  z$std.error <- ese
  z$CI <- cbind(CI.low, CI.up)
  z$pv <- thetpv
  z$jointpv <- jtest.pvs

  return(c(z[c("coefficients", "std.error", "CI", 'pv', "jointpv")]))

}