R/est.IDR.discrete.R

#' EM algorithm to estimate the copula mixture model for discrete data

#' @param x replicate 1: a vector of values from the original observations.
#' @param y replicate 2: paired with \code{x}, a repeated measurement of the same length as \code{x}.
#' @param mu a starting value for the mean of the reproducible component.
#' @param sigma a starting value for the standard deviation of the reproducible component.
#' @param rho a starting value for the correlation coefficient of the reproducible component.
#' @param p a starting value for the proportion of reproducible component.
#' @param eps a small number to control convergence. For example, \code{eps=0.0001}.
#' @param n.missing \code{n.missing=0} indicates that the program runs as if there were no
#'       missing values; a positive value makes the program estimate the number of missing values.
#' @param miss.sym symbol for missing value. For example, \code{miss.sym=0}.
#' @param as.single.loglik an integer n controlling the precision of
#'          numerical integration. If n=1, the computation is exact.
#'          If n>1, only bins with counts greater than n are integrated;
#'          bins with counts <= n are treated as singletons in the
#'          likelihood computation.
#' @param as.single.em similar to \code{as.single.loglik}, except that
#'                     the approximation is applied in the EM step.
#' @param common.only If \code{TRUE}, the computation will only use the
#'                    nonmissing entries. In this case, \code{n.missing}
#'                    should be set to 0. If \code{FALSE}, it will use
#'                    all the entries.
#' @param labels the CDF on the left side of the lower boundary of the bins.
#' @return
#' \itemize{
#'   \item{para   }{estimated parameters: p, rho, mu, sigma, estimated n.missing.}
#'   \item{loglik  }{log-likelihood at the end of the iterations.}
#'   \item{loglik.trace	 }{trajectory of log-likelihood.}
#'   \item{idr.cat  }{a numeric vector of the local idr for each category (i.e. the estimated conditional probability that each observation belongs to the irreproducible component).}
#'   \item{IDR.cat  }{a numeric vector of the expected irreproducible discovery rate for categories that are as irreproducible or more irreproducible than the given categories.}
#'   \item{idr.obs  }{a numeric vector of the local idr for each observation.}
#'   \item{IDR.obs  }{a numeric vector of the expected irreproducible discovery rate for observations that are as irreproducible or more irreproducible than the given observations.}
#' }
#'
#' @details
#' The EM algorithm computes the latent structure in two steps:
#' 1. raw values are first transformed into pseudo-values;
#' 2. EM is then used to estimate the underlying structure, which is a mixture of two normals.
#'
#' @examples
#'
#' # load chip_seq data
#' data(chip_seq)
#' x <- chip_seq[,1]
#' y <- chip_seq[,2]
#'
#' # Initialization
#' mu <- 2.6
#' sigma <- 1.3
#' rho <- 0.8
#' p <- 0.7
#' eps <- 0.001
#' n.missing <- 0
#'
#' # Estimate parameters of mixture model
#' gidr.out <- est.IDR.discrete(x, y, mu, sigma, rho, p, eps, n.missing,
#'                             miss.sym = 0, as.single.loglik = 1,
#'                             as.single.em = 1, common.only=TRUE, labels=NULL)
#'
#' names(gidr.out)
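#'
#' # The returned IDR.obs values can be thresholded to call reproducible
#' # observations; the 0.05 cutoff below is only an illustrative choice.
#' reproducible.idx <- which(gidr.out$IDR.obs < 0.05)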

est.IDR.discrete <- function(x, y, mu, sigma, rho, p, eps, n.missing, miss.sym, as.single.loglik, as.single.em, common.only=TRUE, labels=NULL){

  if(n.missing > 0)
    top.missing <- TRUE
  else
    top.missing <- FALSE

  x.ori <- x
  y.ori <- y

  # keep the non-missing ones
  if(common.only){
    which.nomissing <- x != miss.sym & y != miss.sym
    x <- x[which.nomissing]
    y <- y[which.nomissing]
    n.missing <- 0
    top.missing <- FALSE
  }

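  # collapse the paired observations into discrete categories and obtain the
  # empirical CDF at the upper and lower boundary of each category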
  if(!is.null(labels))
    xy.cat <- get.cat.xy(x, y, n.missing, miss.sym, top.missing, labels)
  else{
    xy.cat <- get.cat.xy(x, y, n.missing, miss.sym, top.missing)
  }
  xy <- xy.cat$cat.xy

  # initialization
  para <- list()
  para$mu <- mu
  para$sigma <- sigma
  para$rho <- rho
  para$p <- p
  para$m <- xy$m

  j <- 1
  to.run <- TRUE
  loglik.trace <- c()
  loglik.inner.trace <- c()

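  # transform the CDF bounds of each category into pseudo-values (upper and
  # lower endpoints of each bin) on the scale of the normal mixture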
  z1.pseudo <- get.pseudo.mix.map2(xy$x.cdf, xy$x.cdf.lo, para$mu, para$sigma,  para$p)
  z1.up <- z1.pseudo$x.pseudo
  z1.lo <- z1.pseudo$x.pseudo.lo

  z2.pseudo <- get.pseudo.mix.map2(xy$y.cdf, xy$y.cdf.lo, para$mu, para$sigma,  para$p)
  z2.up <- z2.pseudo$x.pseudo
  z2.lo <- z2.pseudo$x.pseudo.lo

  l.new.outer <- loglik.2binormal.discrete(cbind(z1.lo, z1.up), cbind(z2.lo, z2.up), para$m, para$mu, para$sigma, para$rho, para$p, as.single.loglik, top.missing)
  loglik.trace[1] <- l.new.outer

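  # outer loop: alternate between running EM on the current pseudo-values and
  # recomputing the pseudo-values from the updated parameters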
  while(to.run){

    i <- 1
    j <- j+1
    #  print
    cat("\nj=", j, "\n")
    cat("EM: loglik.inner.trace\n")

    para.old <- para
    loglik.inner.trace[1] <- l.new.outer

    cat("before EM=", loglik.inner.trace[1], "\n")

    # run EM for a given pseudo-value
    to.run.EM <- TRUE
    while(to.run.EM){
      i <- i+1
      # EM for latent structure
      para <- em.step.2normal.discrete(cbind(z1.lo, z1.up), cbind(z2.lo, z2.up), para$m, para$mu, para$sigma, para$rho, para$p, as.single.em, top.missing)

      #################
      # fix sigma
      # para$sigma <- sigma
      ################
      # this is just the mixture likelihood of two-component Gaussian
      loglik.inner.trace[i] <- loglik.2binormal.discrete(cbind(z1.lo, z1.up), cbind(z2.lo, z2.up), para$m, para$mu, para$sigma, para$rho, para$p, as.single.loglik, top.missing)

      # print likelihood
      #    cat("i=", i, ": ", loglik.inner.trace[i], "\n")
      #    cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "n.missing=", para$m[1], "\n")
      cat("EM ends\n")

      if(abs(loglik.inner.trace[i]-loglik.inner.trace[i-1])<=eps | i > 100 | para$p>0.999 | para$p<0.001 | para$rho>0.999 | para$rho < -0.999)
        to.run.EM <- FALSE
    }

    #   cat("\nout of EM loop, in pseudo-value\n")
    # update pseudo values
    # If new likelihood is lower than previous one, use mid-point between
    # new and old
    to.run.outer <- TRUE  # always run the first iteration
    k <- 0
    while(to.run.outer & k<10){

      #     xy.cat <- get.cat.xy(x, y, para$m[1], miss.sym, top.missing)
      if(!is.null(labels))
        xy.cat <- get.cat.xy(x, y, n.missing, miss.sym, top.missing, labels)
      else{
        xy.cat <- get.cat.xy(x, y, n.missing, miss.sym, top.missing)
      }
      xy <- xy.cat$cat.xy
      z1.pseudo <- get.pseudo.mix.map2(xy$x.cdf, xy$x.cdf.lo, para$mu, para$sigma, para$p)
      z1.up <- z1.pseudo$x.pseudo
      z1.lo <- z1.pseudo$x.pseudo.lo

      z2.pseudo <- get.pseudo.mix.map2(xy$y.cdf, xy$y.cdf.lo, para$mu, para$sigma, para$p)
      z2.up <- z2.pseudo$x.pseudo
      z2.lo <- z2.pseudo$x.pseudo.lo

      l.new.outer <- loglik.2binormal.discrete(cbind(z1.lo, z1.up), cbind(z2.lo, z2.up), para$m, para$mu, para$sigma, para$rho, para$p, as.single.loglik, top.missing)

      cat("l.new.outer=",l.new.outer, " loglik.inner.trace[1]=", loglik.inner.trace[1], "\n")
      if(l.new.outer < loglik.inner.trace[1]-eps){ # if the likelihood decreased, only take a half step
        para$m <- (para$m+para.old$m)/2
        para$mu <- (para$mu+para.old$mu)/2
        para$rho <- (para$rho+para.old$rho)/2
        para$p <- (para$p+para.old$p)/2
        para$sigma <- (para$sigma+para.old$sigma)/2
        k <- k+1
      } else{
        to.run.outer <- FALSE # no need to adjust step
      }

      cat("k=", k, "\n")
      # if it still can't do better than the old parameters, just revert to the old ones
      if(k>= 10){
        para$m <- para.old$m
        para$mu <- para.old$mu
        para$rho <- para.old$rho
        para$p <- para.old$p
        para$sigma <- para.old$sigma
        l.new.outer <- loglik.inner.trace[1]

        if(!is.null(labels))
          xy.cat <- get.cat.xy(x, y, n.missing, miss.sym, top.missing, labels)
        else{
          xy.cat <- get.cat.xy(x, y, n.missing, miss.sym, top.missing)
        }
        #      xy.cat <- get.cat.xy(x, y, para$m[1], miss.sym, top.missing)
        xy <- xy.cat$cat.xy
        z1.pseudo <- get.pseudo.mix.map2(xy$x.cdf, xy$x.cdf.lo, para$mu, para$sigma, para$p)
        z1.up <- z1.pseudo$x.pseudo
        z1.lo <- z1.pseudo$x.pseudo.lo

        z2.pseudo <- get.pseudo.mix.map2(xy$y.cdf, xy$y.cdf.lo, para$mu, para$sigma, para$p)
        z2.up <- z2.pseudo$x.pseudo
        z2.lo <- z2.pseudo$x.pseudo.lo

      }

    }

    loglik.trace[j] <- l.new.outer

    cat("loglik.outer.trace=", loglik.trace[j])
    cat("\n\n")

    # stop when iteration>50
    if(j > 50 | abs(loglik.trace[j] - loglik.trace[j-1])< eps)
      to.run <- FALSE
    #    else
    #        to.run <- l.new.outer - l.old.outer > eps
  }

  #  cat("\nI am here\n", "to.run=", to.run)
  if(top.missing)
    n.est.missing <- para$m[1]
  else
    n.est.missing <- 0

  # calculate idr and IDR for each category
  idr.cat <- 1-para$e.z
  IDR.cat <- get.IDR.discrete(idr.cat, xy$m)

  # calculate idr and IDR for each observation and map to the original obs
  if(common.only){
    idr.obs <- rep(NA, length(x.ori))
    IDR.obs <- rep(NA, length(x.ori))
    idr.obs[which.nomissing] <- idr.cat[xy.cat$level.factor]
    IDR.obs[which.nomissing] <- IDR.cat[xy.cat$level.factor]
  } else {
    num.obs <- (length(xy.cat$level.factor)-length(x.ori)+1):(length(xy.cat$level.factor))
    idr.obs <- idr.cat[xy.cat$level.factor][num.obs]
    IDR.obs <- IDR.cat[xy.cat$level.factor][num.obs]
  }

  return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma, n.missing=n.est.missing),
              loglik=l.new.outer,
              #e.z=para$e.z,
              loglik.trace=loglik.trace,
              #data=xy.cat,
              idr.cat=idr.cat,
              IDR.cat=IDR.cat,
              idr.obs=idr.obs,
              IDR.obs=IDR.obs))
}