R/all.R
In idr: Irreproducible Discovery Rate

Documented in comp.uri d.binormal e.step.2normal est.IDR get.correspondence get.pseudo.mix get.uri.2d loglik.2binormal m.step.2normal select.IDR

# Author: Qunhua Li
# Affiliation: UC Berkeley, Peter Bickel's group
# Reference:
#   Q. Li, J. B. Brown, H. Huang and P. J. Bickel. (2011)
#   Measuring reproducibility of high-throughput experiments.
#   Annals of Applied Statistics. In press.
#
# Upon using this code, you automatically agree to cite the reference above
# in all publications that include results generated from this code or this
# R code itself.
#
# License: GPL-2 

est.IDR <-
function(x, mu, sigma, rho, p, eps=0.001, max.ite=30){

# This is the main function estimates parameters from the copula mixture models
# Input:
#   x: a m x n numeric matrix, where m is the number of replicates and n is
#      the number of entries on each replicate
#   mu, sigma, rho, p: starting values for mean, sd, correlation coefficient
#      and mixing proportion of the reproducible component
#
# Output:
#   (p, rho, mu, sigma): estimates
#   loglik: the mixture likelihood at the end of iterations
#   bic: bic
#   e.z: proterior probability for each observation to be reproducible, i.e.
#        1-local idr
#   loglik.trace: the trace of mixture likelihood  
#
# Implementation details: iterate between the following two steps:
# 1. raw values are first transformed into pseudovalues
# 2. EM is used to compute the underlining structure, which is a mixture
#    of two normals
  
    conv <- function(old,new) abs(new-old) < eps*(1+abs(new))
    
    x1 <- x[, 1]
    x2 <- x[, 2]
    x1.cdf.func <- ecdf(x1)
    x2.cdf.func <- ecdf(x2)
    afactor <- length(x1)/(length(x1) + 1)
    x1.cdf <- x1.cdf.func(x1) * afactor
    x2.cdf <- x2.cdf.func(x2) * afactor
    para <- list()
    para$mu <- mu
    para$sigma <- sigma
    para$rho <- rho
    para$p <- p
    j <- 1
    to.run <- TRUE
    loglik.trace <- c()
    loglik.inner.trace <- c()
    z.1 <- get.pseudo.mix(x1.cdf, para$mu, para$sigma, para$rho, 
        para$p)
    z.2 <- get.pseudo.mix(x2.cdf, para$mu, para$sigma, para$rho, 
        para$p)
    while (to.run) {
        i <- 1
        while (to.run) {
            e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, 
                para$rho, para$p)
            para <- m.step.2normal(z.1, z.2, e.z)
            if (i > 1) 
                l.old <- l.new
            l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, 
                para$rho, para$p)
            loglik.inner.trace[i] <- l.new
            if (i > 1) {
                to.run <- !conv(loglik.inner.trace[i-1],loglik.inner.trace[i])
            }
            i <- i + 1
        }
       
        z.1 <- get.pseudo.mix(x1.cdf, para$mu, para$sigma, para$rho, 
            para$p)
        z.2 <- get.pseudo.mix(x2.cdf, para$mu, para$sigma, para$rho, 
            para$p)
        if (j > 1) 
            l.old.outer <- l.new.outer
        l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, 
            para$rho, para$p)

        loglik.trace[j] <- l.new.outer
        if (j == 1) 
            to.run <- TRUE
        else {
            if (j > max.ite) 
                to.run <- FALSE
            else to.run <- !conv(l.old.outer,l.new.outer)
            
        }
        j <- j + 1
    }
    idr <- 1 - e.z
    o <- order(idr)
    idr.o <- idr[o]
    idr.rank <- rank(idr.o, ties.method = "max")
    top.mean <- function(index, x) {
        mean(x[1:index])
    }
    IDR.o <- sapply(idr.rank, top.mean, idr.o)
    IDR <- idr
    IDR[o] <- IDR.o
    return(list(para = list(p = para$p, rho = para$rho, mu = para$mu, 
        sigma = para$sigma), loglik = l.new, loglik.trace = loglik.trace, 
        idr = 1 - e.z, IDR = IDR))
}

#est.IDR.hist <- function(x, mu, sigma, rho, p, eps, max.ite=30){

  # currently it is a placeholder, to add real function
  
#  est <- est.IDR(x, mu, sigma, rho, p, eps, max.ite=30)
#  return(est)
#}


#est.IDR.discrete <- function(x, mu, sigma, rho, p, eps, max.ite=30){

  # currently it is a placeholder, to add real function
#  est <- est.IDR(x, mu, sigma, rho, p, eps, max.ite=30)
#  return(est)
#}


select.IDR <-
  function(x, IDR.x, IDR.level){

    # Select observations that exceeding a given IDR level
    #
    # Input:
    #   x: a m by n numeric matrix, where m= num of replicates,
    #      n=num of observations. Numerical values representing the
    #      significance of the observations, where signals are expected to have
    #      large values, for example, -log(p-value).   Currently, m=2. 
    #   IDR.x: Irreproducibile discovery rate for each entry of x.
    #      It is computed from est.IDR().
    #   IDR.level: IDR cutoff, a numerical value between [0,1].
    #
    # Output:
    #   x: Observations that are selected.
    #   n: Number of observations that are selected.
    #   IDR.level: IDR cutoff, a numerical value between [0,1].
    #

    is.selected <- IDR.x < IDR.level
    x.selected <- x[is.selected,]
    n.selected <- nrow(x.selected)
    
    return(list(x=x.selected, n=n.selected, IDR.level=IDR.level))
  }


get.correspondence <-
function(x1, x2, t, spline.df=NULL){
  
# compute the correspondence profile
# Input:
#   x1: Data values or ranks of the data values on list 1, a vector of
#    numeric values. Large values need to be significant signals. If small
#    values represent significant signals, rank the signals reversely
#    (e.g. by ranking negative values) and use the rank as x1.
#   x2: Data values or ranks of the data values on list 2, a vector of
#    numeric values. Large values need to be significant signals. If small
#    values represent significant signals, rank the signals reversely
#    (e.g. by ranking negative values) and use the rank as x1.
#   t: A numeric vector between 0 and 1 in ascending order. t is the
#    right-tail percentage.
#   spline.df: Degree of freedom for spline, to control the smoothness of the
#    smoothed curve. 
#
# Output:
#  psi: the correspondence profile in terms of the scale of percentage,
#     i.e. between (0, 1)
#  dpsi: the derivative of the correspondence profile in terms of the scale
#     of percentage, i.e. between (0, 1)}
#  psi.n: the correspondence profile in terms of the scale of the number of
#     observations
#  dpsi.n: the derivative of the correspondence profile in terms of the scale
#     of the number of observations}
#
#  Each object above is a list consisting of the following items:
#  t: upper percentage (for psi and dpsi) or number of top ranked
#  observations (for psi.n and dpsi.n) 
#  value: psi or dpsi
#  smoothed.line: smoothing spline
#  ntotal: the number of observations
#  jump.point: the index of the vector of t such that psi(t[jump.point])
#              jumps up due to ties at the low values. This only
#	      happends when data consists of a large number of discrete
#	      values, e.g. values imputed for observations
#	      appearing on only one replicate. 
  
  psi.dpsi <- get.uri.2d(x1, x2, t, t, spline.df)

  psi <- list(t=psi.dpsi$tv[,1], value=psi.dpsi$uri, 
              smoothed.line=psi.dpsi$uri.spl, ntotal=psi.dpsi$ntotal, 
              jump.point=psi.dpsi$jump.left)

  dpsi <- list(t=psi.dpsi$t.binned, value=psi.dpsi$uri.slope,
               smoothed.line=psi.dpsi$uri.der, ntotal=psi.dpsi$ntotal, 
               jump.point=psi.dpsi$jump.left)

  psi.n <- list(t=psi$t*psi$ntotal, value=psi$value*psi$ntotal, 
                 smoothed.line=list(x=psi$smoothed.line$x*psi$ntotal, 
                                    y=psi$smoothed.line$y*psi$ntotal),
                 ntotal=psi$ntotal, jump.point=psi$jump.point) 

  dpsi.n <- list(t=dpsi$t*dpsi$ntotal, value=dpsi$value, 
                 smoothed.line=list(x=dpsi$smoothed.line$x*dpsi$ntotal, 
                                    y=dpsi$smoothed.line$y),
                 ntotal=dpsi$ntotal, jump.point=dpsi$jump.point) 

  return(list(psi=psi, dpsi=dpsi, psi.n=psi.n, dpsi.n=dpsi.n))
}


get.uri.2d <-
function(x1, x2, tt, vv, spline.df=NULL){

  o <- order(x1, x2, decreasing=TRUE)
  
  # sort x2 by the order of x1
  x2.ordered <- x2[o]
  
  tv <- cbind(tt, vv)
  ntotal <- length(x1) # number of peaks    

  uri <- apply(tv, 1, comp.uri, x=x2.ordered)

  # compute the derivative of URI vs t using small bins
  uri.binned <- uri[seq(1, length(uri), by=4)]
  tt.binned <- tt[seq(1, length(uri), by=4)]
  uri.slope <- (uri.binned[2:(length(uri.binned))] - uri.binned[1:(length(uri.binned)-1)])/(tt.binned[2:(length(uri.binned))] - tt.binned[1:(length(tt.binned)-1)])

  # smooth uri using spline
  # first find where the jump is and don't fit the jump
  # this is the index on the left
  # jump.left.old  <- which.max(uri[-1]-uri[-length(uri)])
  short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2))

  if(short.list.length < max(tt)){
    jump.left <- which(tt>short.list.length)[1]-1
  } else {
    jump.left <- which.max(tt)
  }

#  reversed.index <- seq(length(tt), 1, by=-1)
#  nequal <- sum(uri[reversed.index]== tt[reversed.index])
#  temp  <- which(uri[reversed.index]== tt[reversed.index])[nequal]
#  jump.left <- length(tt)-temp
 
  if(jump.left < 6){
   jump.left <- length(tt)
  }
    
 
  if(is.null(spline.df))
    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4)
  else{
    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df)
  }
  # predict the first derivative
  uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1)

  invisible(list(tv=tv, uri=uri, 
                 uri.slope=uri.slope, t.binned=tt.binned[2:length(uri.binned)], 
                 uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left,
                 ntotal=ntotal))
 }



#plot.correspondence <-
#  function(corr.profile, plot.type, plot.all){

   # Plot the correspondence profile for the output from get.correspondence()
   #  
   # Input:
   #   corr.profile: correspondence profile computed from get.correspondence()
   #   plot.type: "uri", "duri", "uri.n", "duri.n"
   #     "uri": upper rank intersection (y-axis) vs t (x-axis), where t is
   #            the percentage from upper side. This is the psi plot in the
   #            reference paper. 
   #     "duri": derivative of upper rank intersaction (y-axis) vs t (x-axis).
   #            This is the psi' plot in the reference paper
   #     "uri.n": same as "uri", except that x-axis is the number of
   #            observations
   #     "duri.n": same as "duri", except that x-axis is the number of
   #            observations
   #   plot.all: logical value.  
   #     If the rank lists consist of a large number of ties at the bottom
   #    (e.g. the same low value is imputed to the list for the observations
   #    that appear on only one list), it may be desirable to plot only
   #    observations before hitting the ties. True: plot all observations.
   #    False: plot observations before hitting the ties. 
    
#  if(plot.type=="uri"){  

    # plot correspondence curve on the scale of percentage
#    if(plot.all){
      
#      plot(corr.profile$psi$t, corr.profile$psi$value, xlab="t", ylab="psi", xlim=c(0, max(corr.profile$psi$t)), ylim=c(0, max(corr.profile$psi$value)), cex.lab=2)
#      lines(corr.profile$psi$smoothed.line, lwd=4)
#      abline(coef=c(0,1), lty=3)
#    } else {
    
    # If the rank lists consist of a large number of ties at the bottom
    # (e.g. the same low value is imputed to the list for the observations
    # that appear on only one list), it may be desirable to plot only
    # observations before hitting the ties. Then it can be plotted using the
    # following
#      plot(corr.profile$psi$t[1:corr.profile$psi$jump.point], corr.profile$psi$value[1:corr.profile$psi$jump.point], xlab="t", ylab="psi", xlim=c(0, max(corr.profile$psi$t[1:corr.profile$psi$jump.point])), ylim=c(0, max(corr.profile$psi$value[1:corr.profile$psi$jump.point])), cex.lab=2)
#      lines(corr.profile$psi$smoothed.line, lwd=4)
#      abline(coef=c(0,1), lty=3)
#    }
    
#  } else {

#    if(plot.type=="duri"){
      
      # plot the derivative of correspondence curve on the scale of percentage 
#      plot(corr.profile$dpsi$t, corr.profile$dpsi$value, xlab="t", ylab="psi'", xlim=c(0, max(corr.profile$dpsi$t)), ylim=c(0, max(corr.profile$dpsi$value)), cex.lab=2)
#      lines(corr.profile$dpsi$smoothed.line, lwd=4)
#      abline(h=1, lty=3)
#    } else {

#      if(plot.type=="uri.n"){
        
        # plot correspondence curve on the scale of the number of observations
#        plot(corr.profile$psi.n$t, corr.profile$psi.n$value, xlab="t", ylab="psi", xlim=c(0, max(corr.profile$psi.n$t)), ylim=c(0, max(corr.profile$psi.n$value)), cex.lab=2)
#        lines(corr.profile$psi.n$smoothed.line, lwd=4)
#        abline(coef=c(0,1), lty=3)
#      } else {
        
        # plot the derivative of correspondence curve on the scale of the
        # number of observations
#        plot(corr.profile$dpsi.n$t, corr.profile$dpsi.n$value, xlab="t", ylab="psi'", xlim=c(0, max(corr.profile$dpsi.n$t)), ylim=c(0, max(corr.profile$dpsi.n$value)), cex.lab=2)
#        lines(corr.profile$dpsi.n$smoothed.line, lwd=4)
#        abline(h=1, lty=3)
#      }
#    }
#  }
  
#}


##################### start internal functions #############

comp.uri <-
function(tv, x){

  # Internal function
  #
  # Compute the value of Psi and Psi' for a given t
  # An internal function to compute Psi for a given (t, v)
  
  # Input:
  #   tv: A vector of two numeric values, t and v. Both t and v are in [0,1]
  #   x: A numeric vector of x, sorted by the order of y
  #
  # Output:
  #   A numeric value of Psi(t, v)

  n <- length(x)
  qt <- quantile(x, prob=1-tv[1]) # tv[1] is t
  sum(x[1:ceiling(n*tv[2])] >= qt)/n

}


d.binormal <-
  function(z.1, z.2, mu, sigma, rho){

  # Internal function
  #  
  # Compute the log-density for parameterized bivariate Gaussian
  # distribution N(mu, mu, sigma, sigma, rho).
  #
  # Input:
  #   z.1: a numerical data vector on coordinate 1. 
  #   z.2: a numerical data vector on coordinate 1.
  #   mu: mean 
  #   sigma: standard deviation 
  #   rho: correlation coefficient
  #  
  # Output:
  #   Log density of bivariate Gaussian distribution
  #    N(mu, mu, sigma, sigma, rho).
  

  loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2))

  return(loglik)
}

e.step.2normal <-
function(z.1, z.2, mu, sigma, rho, p){

  # Internal function
  #
  # Expectation step in the EM algorithm for parameterized bivariate
  # 2-component Gaussian mixture models with (1-p)N(0, 0, 1, 1, 0) +
  # pN(mu, mu, sigma, sigma, rho).

  # Input:
  #  z.1: a numerical data vector on coordinate 1.
  #  z.2: a numerical data vector on coordinate 2.
  #  mu: mean for the reproducible component.
  #  sigma: standard deviation of the reproducible component.
  #  rho: correlation coefficient of the reproducible component.
  #  p: mixing proportion of the reproducible component.
  #
  # Output:
  #  e.z: a numeric vector, where each entry represents the estimated expected
  #  conditional probability that an observation is in the reproducible
  #  component.

  e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p)
  
  invisible(e.z)
}

get.pseudo.mix <-
function(x, mu, sigma, rho, p){

  # Internal function
  #
  # Compute the pseudo values of a mixture model from the empirical CDF
  #
  # Input:
  #   x: A vector of values of empirical CDF
  #   mu: Mean of the reproducible component in the mixture model on the
  #       latent space
  #   sigma: Standard deviation of the reproducible component in the mixture
  #       model on the latent space
  #   rho: Correlation coefficient of the reproducible component in the
  #       mixture model on the latent space
  #   p: Mixing proportion of the reproducible component in the mixture model
  #      on the latent space
  #
  # Output:
  #   The values of a mixture model corresponding to the empirical CDF
  
  # first compute cdf for points on the grid
  # generate 200 points between [-3, mu+3*sigma]
  nw <- 1000
  w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw) 
  w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1)

  i <- 1

  quan.x <- rep(NA, length(x))

  for(i in c(1:nw)){
    index <- which(x >= w.cdf[i] & x < w.cdf[i+1])
    quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i]
  }

  index <- which(x < w.cdf[1])
  if(length(index)>0)
    quan.x[index] <- w[1]

  index <- which(x > w.cdf[nw])
  if(length(index)>0)
    quan.x[index] <- w[nw]  
  
  invisible(quan.x)
}

loglik.2binormal <-
function(z.1, z.2, mu, sigma, rho, p){

  # Internal function
  #
  # Compute the log-likelihood for parameterized bivariate 2-component Gaussian
  # mixture models with (1-p)N(0, 0, 1, 1, 0) + pN(mu, mu, sigma,sigma, rho).
  # 
  # Input:
  #  z.1: a numerical data vector on coordinate 1. 
  #  z.2: a numerical data vector on coordinate 1. 
  #  mu: mean for the reproducible component. 
  #  sigma: standard deviation of the reproducible component. 
  #  rho: correlation coefficient of the reproducible component. 
  #  p: mixing proportion of the reproducible component. 

  # Output:
  #  Log-likelihood of the bivariate 2-component Gaussian mixture models
  #  $(1-p)N(0, 0, 1, 1, 0) + N(mu, mu, sigma, sigma, rho)$.
  
  l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p)))
  
  return(l.m) 
}


m.step.2normal <-
function(z.1, z.2, e.z){

  # Internal function
  # Maximization step in the EM algorithm for parameterized bivariate
  # 2-component Gaussian mixture models with $(1-p)N(0, 0, 1, 1, 0) +
  # pN(mu, mu, sigma^2, sigma^2, rho)$.
  #
  # Input:
  #  z.1: a numerical data vector on coordinate 1.
  #  z.2: a numerical data vector on coordinate 2.
  #  e.z: a vector of expected conditional probability that the $i$th
  #  observation is reproducible. 
  #
  # Output:
  #  Estimated parameters, basically a list including elements 
  #  p: the estimated mixing proportion of the reproducible component.
  #  mu: the estimated mean for the reproducible component.
  #  sigma: the estimated standard deviation of the reproducible component.
  #  rho: the estimated correlation coefficient of the reproducible component.
  
  
  p <- mean(e.z)
  mu <- sum((z.1+z.2)*e.z)/2/sum(e.z) 
  sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z))
  rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2)))

  return(list(p=p, mu=mu, sigma=sigma, rho=rho))
}

select.IDR <-
  function(x, IDR.x, IDR.level){

    is.selected <- IDR.x < IDR.level
    x.selected <- x[is.selected,]
    n.selected <- nrow(x.selected)
    
    return(list(x=x.selected, n=n.selected, IDR.level=IDR.level))
  }
Any scripts or data that you put into this service are public.
idr documentation built on June 21, 2022, 9:05 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
idr
Irreproducible Discovery Rate

R/all.R
In idr: Irreproducible Discovery Rate

Defines functions select.IDR m.step.2normal loglik.2binormal get.pseudo.mix e.step.2normal d.binormal comp.uri get.uri.2d get.correspondence select.IDR est.IDR

Documented in comp.uri d.binormal e.step.2normal est.IDR get.correspondence get.pseudo.mix get.uri.2d loglik.2binormal m.step.2normal select.IDR

Try the idr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

idr Irreproducible Discovery Rate

R/all.R In idr: Irreproducible Discovery Rate

Defines functions select.IDR m.step.2normal loglik.2binormal get.pseudo.mix e.step.2normal d.binormal comp.uri get.uri.2d get.correspondence select.IDR est.IDR

Documented in comp.uri d.binormal e.step.2normal est.IDR get.correspondence get.pseudo.mix get.uri.2d loglik.2binormal m.step.2normal select.IDR

Try the idr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

idr
Irreproducible Discovery Rate

R/all.R
In idr: Irreproducible Discovery Rate