R/cFDR.R

Defines functions cfdr exp_quantile trap exp_quantile_point cor_shared c2a fit.em plotred

Documented in c2a cfdr cor_shared exp_quantile exp_quantile_point fit.em

##' Compute estimated cFDR values from a set of pairs of p values, or adjusted p-values if controls are shared.
##'
##' For a set of p-values or adjusted p values \eqn{P_{j}}{P_i} for the principal phenotype, and a set of p values \eqn{P_{j}}{P_j} for the conditional phenotype, the cFDR for SNP \eqn{x} is estimated as
##' 
##' \deqn{
##'   \widehat{cFDR}(x)=P_{i}(x) \frac{\textrm{(# SNPs with $P_{j} \leq P_{j}(x)$}{\textrm{(# SNPs with $P_{i} \leq P_{i}(x), P_{j} \leq P_{j}(x))}}}
##' }{
##'   est. cFDR(x) = P_i(x) * (# P_j \le P_j(x)) / (# P_i \le P_i(x) AND P_j \le P_j(x))
##' }
##'
##' The procedure can be sped up by specifying a subset of SNPs at which to estimate the cFDR. This is most effective if a boundary of the subset can be written as \eqn{P_{j} = f(P_{i})}{P_j = f(P_i)}, where \eqn{f} is non-increasing. Equivalently, if \eqn{S} is the set of all p-value pairs, the subset \eqn{S'} should satisfy the condition
##' 
##' \deqn{
##'   \forall \left( p_{i}',p_{j}' \in S', p_{i},p_{j} \in S \right) \: p_{i} \leq  p_{i}', p_{j} \leq p_{j}' \implies p_{i},p_{j} \in S'
##' }{
##'   for all (p_i',p_j') in S', (p_i,p_j) in S:   p_i \le p_i', q_j \le p_j   =>    (p_i,p_j) in S'.
##' }
##'
##' An example of such a subset is the set of \eqn{p_{i},p_{j}}{p_i,p_j} with \eqn{p_{i}^2 + p_{j}^2 < \alpha}{p_i^2 + p_j^2 < \alpha}.
##' 
##' The estimated cFDR is a conservative estimate (upper bound) on the quantity 
##' 
##' \deqn{
##'   Pr(H_{0}^{i} | P_{i}<P_{i}(x),P_{j}<P_{j}(x))
##'  }{
##'   Pr(H0 | P_i<P_i(x),P_j<P_j(x))
##'  }
##' 
##' where \eqn{H_{0}^{i}}{H0} is the null hypothesis at SNP \eqn{x} for the principal phenotype (ie, SNP \eqn{x} is not associated with the principal phenotype)
##' 
##' @title cfdr
##' @param P_i vector of p-values for the principal phenotype.
##' @param P_j vector of p-values or adjusted p-values for the conditional phenotype. If controls are shared between GWAS, the p-values must be adjusted using the function \code{\link{exp_quantile}}.
##' @param sub set of indices; only compute cFDR at this subset of p-value pairs. Typically, only SNPs with low values of P_i or P_j are of interest; it is possible to increase the speed of the procedure by only calculating cFDR at such SNPs.
##' @export
##' @author James Liley
##' @examples
##' require(mnormt)
##' P = 2*pnorm(-abs(rmnorm(10000,varcov=diag(c(0.8,0.8))+0.2)))
##' P_i = P[,1]; P_j=P[,2]
##' W = which((qnorm(P_i/2)^2) + (qnorm(P_j/2)^2) > 4)
##'
##' # Using function
##' C = cfdr(P_i,P_j,sub=W); C[W[1]]
##' 
##' # Explicit computation
##' P_i[W[1]]/ (length(which (P_i <= P_i[1] & P_j <= P_j[1])) / length(which (P_j <= P_j[1])) )
##'

cfdr = function(P_i,P_j,sub=1:length(P_i)) {
  
  # various error handlers
  if (!is.numeric(P_i) | !is.numeric(P_j) || (max(P_i,na.rm=TRUE)>1) || (max(P_j,na.rm=TRUE)>1) || (min(P_i,na.rm=TRUE)<0) || (min(P_j, na.rm=TRUE)<0)) stop("Arguments P_i and P_j must both be vectors of p values or adjusted p values")
  if (!(all(sub %in% 1:length(P_i)) | all(sub %in% names(P_i)))) stop("Parameter sub must be a valid list of indices of P_i")
  
  # check sub, add warning
  
  
  C=rep(NA,length(P_i)); names(C)=names(P_i)
  wx=rep(0,length(P_i)); wx[sub]=1;
  
  ww=which(!is.na(P_i+P_j))
  P_i=P_i[ww]; P_j=P_j[ww]
  wx=wx[ww]; subx=which(wx==1)
  
  # For each P_i,P_j pair in 'sub', count the number of SNPs with smaller P_i and P_j values
  R_i=rank(P_i); R_jx=rank(P_j)
  o=order(R_i)
  wx=wx[o]; suby=which(wx==1)
  R_j=R_jx[o]
  N=length(P_i)
  maxl=rep(1,N)
  out=rep(1,N)
  for (i in 1:length(suby)) {
    out[suby[i]]=maxl[R_j[suby[i]]]
    maxl[R_j[suby[i]]:N]=1+maxl[R_j[suby[i]]:N]
  }
  
  # Divide by number of SNPs with smaller P_j values, then divide P_j values by the result to get cFDR.
  Cx=rep(1,length(ww))
  Cx[subx]=P_i[subx]/(out[order(o)][subx]/R_jx[subx])
  
  C[ww]=Cx
  C
}





##' Transform principal p-values for estimating cFDR in a shared control design. Computes the 'expected quantile' of a p value.
##'
##' Computes the probability that a p value at some SNP for a phenotype \eqn{i} is less than some cutoff \eqn{p_{i}}{p_i} given that the p value at that SNP for a second phenotype \eqn{j} is less than a cutoff \eqn{p_{j}}{p_j}, under the null hypothesis that the SNP is not associated with phenotype \eqn{i}.
##'
##' If the GWAS for phenotypes \eqn{i} and \eqn{j} share a number of controls, effect sizes for the two phenotypes will be correlated, even at null SNPs. This leads to dependence of the expected quantile both on the number of shared controls and study sizes (through paramter \code{\link{rho}}), and the distribution of Z values for phenotype \eqn{j} across all SNPs, not all of which will necessarily be null for phenotype \eqn{j}.
##'
##' The distribution of Z scores for phenotype \eqn{j} can be specified in two ways, depending on the parameter method. In the first case, we assume that the distribution of 'true' Z scores (Z scores which we would observe if the observed allele frequencies exactly matched the population allele frequencies) has a mixture distribution of 0 with probability \code{\link{pi0}}, and N(0,\code{\link{sigma}}^2) with probability 1-\code{\link{pi0}}. These parameters can be estimated from an observed distribution of Z scores using the function \code{\link{fit.em}} .
##'
##' In the second case, the distribution of true Z scores can be specified as an array of Z scores and corresponding probabilities (or PDF heights).
##'
##' @title exp_quantile
##' @param p Observed p value or set of p values for principal phenotype
##' @param pc Observed p value or set of p values for conditional phenotype. Must be of the same length as \code{\link{p}}.
##' @param rho Correlation between z values induced by shared controls
##' @param  method Set to 1 to assume Z scores for the conditional phenotype follow a mixed distribution characterised by \code{\link{pi0}}, \code{\link{sigma}}; 2 to specify assume they follow empirical distribution characterised by \code{\link{dist}}. 
##' @param pi0 Relevant only if \code{\link{method}} = 1. Parameter for distribution of Z scores for the conditional phenotype (proportion of SNPs null for conditional phenotype). 
##' @param sigma Relevant only if \code{\link{method}} = 1. Parameter for distribution of Z scores for the conditional phenotype (variance of true effect sizes for SNPs non-null for conditional phenotype).
##' @param dist Relevant only if \code{\link{method}} = 2. Matrix with two rows; first is a list of potential values for Z scores for conditional phenotype, second is a list of probabilities associated with those values.
##' @return A list of transformed p values (expected quantiles) of the same length as \code{\link{p}}.
##' @export
##' @author James Liley
##' @examples
##' p <- 0.2; pc <- 0.1;
##' rho <- 0.25
##' sigma <- 5; pi0 <- 0.95
##' exp_quantile(p,pc,rho,method=1,pi0=pi0,sigma=sigma)
##' # Generally the expected quantile is close to p. In this case, the SNP is 'probably' null for phenotype j, so the MAF in controls is somewhat aberrant from expected (as pc<0.1). This means that the expected quantile is higher than the p value.
##'
##' p<-0.2; pc <-1e-8;
##' exp_quantile(p,pc,rho,method=1,pi0=pi0,sigma=sigma)
##' # If a low value of pc is observed, it is 'likely' to have arisen from a non-null SNP for phenotype j. This p value is quite reasonable for a non-null SNP, given the distribution of z values for such SNPs (sigma=5), so the aberration from expected MAF amongst controls is minimal.
##'
##' pi0=1
##' exp_quantile(p,pc,rho,method=1,pi0=pi0,sigma=sigma)
##' # If, on the other hand, pi0=1 (no non-null SNPs for phenotype j), and we happen to observe a p value of 1e-8 for the conditional phenotype, we conclude that MAF in controls is likely to be very aberrant from expected, so the expected quantile is markedly higher than the p value.
##'
##' rho <- 0
##' exp_quantile(p,pc,rho,method=1,pi0=pi0,sigma=sigma)
##' # If rho=0 (no shared controls) z values for the two phenotypes are independent, and the expected quantile equals parameter p.
##'
##' p3 = rbind((-500:500)/20, dnorm((-500:500)/20,sd=3))
##' exp_quantile(5e-4,5e-3,0.3,method=2,pi0=0.9,dist=p3)
##' exp_quantile(5e-4,5e-3,0.3,method=1,pi0=0.9,sigma=3)
##' # Demonstrates specification of an empirical distribution for Z scores for the conditional phenotype.
##' 
##' 

exp_quantile = function(p,pc,rho,method=1,pi0=0.5,sigma=1,dist = rbind((1:10)/10,dnorm((1:10)/10,sd=3))) {
  
  require(pbivnorm)
  
  if (rho==0) return(p) else {
    
    z = -qnorm(p/2)
    zc = -qnorm(pc/2);
    
    if (method==1) {
      num = (pi0*pbivnorm(-z,-zc,rho=rho)) + (pi0*pbivnorm(-z,-zc,rho=-rho)) + ((1-pi0)*pbivnorm(-z,-zc/sqrt(1+(sigma^2)),rho=rho/sqrt(1+(sigma^2)))) +  ((1-pi0)*pbivnorm(-z,-zc/sqrt(1+(sigma^2)),rho=-rho/sqrt(1+(sigma^2))))
      denom = (pi0*pnorm(-zc)) + (1-pi0)*pnorm(-zc/sqrt(1+(sigma^2)))
      return(num/denom)
    }
    if (method==2) {
      dist[,2]=dist[,2]/(trap(dist[,1],dist[,2]))
      num = matrix(0,length(z), dim(dist)[2]); denom=num;
      for (i in 1:dim(dist)[2]) {
        num[,i] = pbivnorm(-z,-zc-dist[1,i],rho=rho) + pbivnorm(-z,-zc+dist[1,i],rho=rho) + pbivnorm(-z,-zc-dist[1,i],rho=-rho) + pbivnorm(-z,-zc+dist[1,i],rho=-rho)
        denom[,i] = pnorm(-zc-dist[1,i]) + pnorm(-zc+dist[1,i])
      }
      num_null = 2*((pbivnorm(-z,-zc,rho=rho) + pbivnorm(-z,-zc,rho=-rho))); denom_null = 2*pnorm(-zc); 
      num_nonnull=num_null; denom_nonnull=denom_null; # initialise
      for (i in 1:length(z)) {
        num_nonnull[i] = trap(dist[1,],num[i,]*dist[2,]); 
        denom_nonnull[i] = trap(dist[1,],denom[i,]*dist[2,])
      }
      return(((pi0*num_null) + ((1-pi0)*(num_nonnull)))/((pi0*denom_null) + ((1-pi0)*(denom_nonnull))))
    }
  }
}

# Trapezoid rule
trap = function(x,y) {
  0.5*((c(x,0)-c(0,x))[2:(length(x))] %*% (c(y,0)+c(0,y))[2:(length(y))])
}




##' Evaluate the conditional expected quantile of a p value in the shared control design conditional on a p value for a second phenotype
##'
##' Computes the probability that a p value for a phenotype \eqn{i} is less than some cutoff \code{\link{p}} given that the p value for a second phenotype \eqn{j} is equal to a value \code{\link{pc}}, under the null hypothesis for phenotype \eqn{i}.
##'
##' If the GWAS for phenotypes \eqn{i} and \eqn{j} share a number of controls, effect sizes for the two phenotypes will be correlated, even at null SNPs. This leads to dependence of the expected quantile both on the number of shared controls and study sizes (through paramter \code{\link{rho}}), and the distribution of Z scores for phenotype \eqn{j} across all SNPs, not all of which will necessarily be null for phenotype \eqn{j}.
##'
##' The 'true' Z scores (Z scores which we would observe if the observed allele frequencies exactly matched the population allele frequencies) are assumed to follow a mixture distribution of 0 with probability \code{\link{pi0}}, and N(0,\code{\link{sigma}}^2) with probability 1-\code{\link{pi0}}. These parameters can be estimated from an observed distribution of Z scores using the function \code{\link{fit.em}} .
##'
##' The value of the function is the first derivative of the function \code{\link{exp_quantile}} with respect to \eqn{P_[j]}{P_j}.
##'
##' @title exp_quantile_point
##' @param p Observed p value or set of p values for principal phenotype
##' @param pc Observed p value or set of p values for conditional phenotype. Must be of the same length as \code{\link{p}}.
##' @param rho Correlation between Z scores induced by shared controls. Output from \code{\link{cor_shared}}.
##' @param pi0 Parameter for distribution of Z scores for the conditional phenotype (proportion of SNPs null for conditional phenotype).
##' @param sigma Parameter for distribution of Z scores for the conditional phenotype (variance of true effect sizes for SNPs non-null for conditional phenotype).
##' @return A list of probabilities of the same length as \code{\link{p}}.
##' @export
##' @author James Liley
##' @examples
##' p <- 0.2; pc <- 0.1;
##' rho <- 0.25
##' sigma <- 5; pi0 <- 0.95
##' exp_quantile_point(p,pc,rho,pi0=pi0,sigma=sigma)
##' # Generally the expected quantile is close to p. 

exp_quantile_point = function(p,pc,rho,pi0=1,sigma=1) {
  
  require(pbivnorm)
  
  z = -qnorm(p/2)
  zc = -qnorm(pc/2);
  
  num = (dnorm(-zc)*pi0*
           (pnorm(-z,mean = -rho*zc,sd = sqrt(1-(rho^2)))+pnorm(-z,mean = rho*zc,sd = sqrt(1-(rho^2))))) + 
    (dnorm(-zc,sd = sqrt(1+(sigma^2)))*(1-pi0)*
       (pnorm(-z,mean = -rho*zc/sqrt(1+(sigma^2)),sd = sqrt(1-(rho^2)+(sigma^2))/sqrt(1+(sigma^2))) +
          pnorm(-z,mean = rho*zc/sqrt(1+(sigma^2)),sd = sqrt(1-(rho^2)+(sigma^2))/sqrt(1+(sigma^2)))))
  denom = ((pi0*dnorm(-zc)) + (1-pi0)*dnorm(-zc,sd = sqrt(1+(sigma^2))))
  
  return(num/denom)
}




##' Compute correlation between z values arising from shared controls
##'
##' Uses a formula from Zaykin et al (2010)
##'
##' @title cor_shared
##' @param n0i Number of controls in GWAS for principal phenotype
##' @param n0j Number of controls in GWAS for conditional phenotype
##' @param n1i Number of cases in GWAS for principal phenotype
##' @param n1j Number of cases in GWAS for conditional phenotype
##' @param overlap Number of controls shared between studies
##' @return Asymptotic correlation between z scores.
##' @export
##' @author James Liley
##' @examples
##'
##' n0i = 1500; n0j = 1200; n1i = 800; n1j = 700;
##' overlap = 1000;
##' cor_shared(n0i,n0j,n1i,n1j,overlap)
##'
##'
##' # Simulation
##' n_snps = 10000; sim_maf = runif(n_snps,min=0.1,max=0.5) # simulated MAF
##' sim_con_i = matrix(runif(n_snps*n0i),n_snps,n0i)<sim_maf # simulated genotypes for controls for phenotype i
##' sim_con_j = matrix(runif(n_snps*n0j),n_snps,n0j)<sim_maf; sim_con_j[,1:overlap]=sim_con_i[,1:overlap] # simulated genotypes for controls for phenotype j, with shared controls
##' sim_case_i = matrix(runif(n_snps*n1i),n_snps,n1i)<sim_maf; sim_case_j = matrix(runif(n_snps*n1j),n_snps,n1j)<sim_maf # simulated genotypes for cases
##' om0i=rowMeans(sim_con_i); om0j=rowMeans(sim_con_j); om1i = rowMeans(sim_case_i); om1j = rowMeans(sim_case_j) # observed minor allele frequencies
##' or_i = om0i*(1-om1i)/(om1i*(1-om0i)); or_j = om0j*(1-om1j)/(om1j*(1-om0j)) # Odds ratios
##' se_i = sqrt( (1/(2*om0i*n0i)) + (1/(2*(1-om0i)*n0i)) + (1/(2*om1i*n1i)) + (1/(2*(1-om1i)*n1i)) )
##' se_j = sqrt( (1/(2*om0j*n0j)) + (1/(2*(1-om0j)*n0j)) + (1/(2*om1j*n1j)) + (1/(2*(1-om1j)*n1j)) ) # Standard errors
##' effect_i = log(or_i)/se_i; effect_j = log(or_j)/se_j # Effect sizes
##' 
##' cor(effect_i,effect_j)
##' cor_shared(n0i,n0j,n1i,n1j,overlap)
##' 
##' cor(abs(effect_i),abs(effect_j))
##' cor_shared(n0i,n0j,n1i,n1j,overlap)^2

cor_shared = function(n0i, n0j, overlap, n1i, n1j) {
  
  N01 = n0i-overlap; N02=n0j-overlap
  sqrt(1/( (1+(N02/overlap))*(1+(N01/overlap))*(1+(N01/n1i)+(overlap/n1i))*(1+(N02/n1j)+(overlap/n1j)) ));
  
}




##' Compute an upper bound on the false discovery rate amongst SNPs with cFDR less than some cutoff \eqn{\alpha} .
##' 
##' Bound is based on estimating the area of the region of the unit square containing all potential p-value pairs \eqn{p_{i},p_{j}}{p_i,p_j} such that \eqn{\widehat{cFDR}(p_{i},p_{j}) \leq \alpha}{est. cFDR(p_i,p_j) \le \alpha}. It is typically conservative.
##' 
##' Computation requires parametrisation of the joint distribution of Z scores for the conditional phenotype at SNPs null for the principal phenotype. This is assumed to be mixture-Gaussian, consisting of N(0,I_2) with probability \code{\link{pi0}} and N(0,(1,rho; rho,\code{\link{sigma}}^2)) with probability 1-\code{\link{pi0}}. Values of \code{\link{pi0}} and \code{\link{sigma}} can be obtained from the fitted parameters of the null model usign the function \code{\link{fit.em}}.
##' 
##' The probability is computed using a numerical integral over the (+/+) quadrant and the range and resolution of the integral can be set.
##' @title c2a
##' @param P_i vector of p-values for the principal phenotype.
##' @param P_j vector of p-values or adjusted p-values for the conditional phenotype. If controls are shared between GWAS, the p-values must be adjusted using the function \code{\link{exp_quantile}}.
##' @param cutoffs vector of cFDR cutoffs at which to compute overall FDR
##' @param pi0 proportion of SNPs not associated with conditional phenotype
##' @param sigma standard deviation of observed conditional Z scores in SNPs associated with the conditional phenotype
##' @param rho covariance between principal and conditional Z scores arising due to shared controls; output from \code{\link{cor_shared}}.
##' @param xmax compute integral over [0,\code{\link{xmax}}] x [0,\code{\link{ymax}}] as approximation to [0,inf] x [0,inf]
##' @param ymax compute integral over [0,\code{\link{xmax}}] x [0,\code{\link{ymax}}] as approximation to [0,inf] x [0,inf]
##' @param res compute integral at gridpoints with this spacing.
##' @param vector of FDR values corresponding to \code{\link{cutoffs}}
##' @author James Liley
##' @export
##' @return list of FDR values
##' @examples
##' nn=100000
##' 
##' Z=abs(rbind(rmnorm(0.8*nn,varcov=diag(2)), rmnorm(0.15*nn,varcov=rbind(c(1,0),c(0,2^2))), rmnorm(0.05*nn,varcov=rbind(c(3^2,2),c(2,4^2)))));
##' P=2*pnorm(-abs(Z))
##' 
##' X=cfdr(P[,1],P[,2],sub=which(Z[,1]^2 + Z[,2]^2 > 6))
##' Xm=which(X<0.05); Xsub=Xm[order(runif(length(Xm)))[1:100]] # sample of cFDR values 
##' 
##' true_fdr=rep(0,100); for (i in 1:100) true_fdr[i]=length(which(X[1:(0.95*nn)] <= X[Xsub[i]]))/length(which(X<=X[Xsub[i]])) # true FDR values (empirical)
##' fdr=c2a(P[,1],P[,2],X[Xsub],pi0=0.95,sigma=2) # estimated FDR using area method
##' 
##' plot(true_fdr,fdr,xlab="True FDR",ylab="Estimated",col="red"); points(true_fdr,X[Xsub],col="blue"); abline(0,1); legend(0.1*max(true_fdr),0.7*max(fdr),c("Area method", "cFDR"),col=c("red", "blue"),pch=c(1,1)) # cFDR underestimates true FDR; area method gives good approximation.
##'
c2a=function(P_i,P_j,cutoffs,pi0=0.5,sigma=1,rho=0,xmax=12,ymax=12,res=0.01) {

  if (!is.numeric(P_i) | !is.numeric(P_j) || (max(P_i,na.rm=TRUE)>1) || (max(P_j,na.rm=TRUE)>1) || (min(P_i,na.rm=TRUE)<0) || (min(P_j, na.rm=TRUE)<0)) stop("Arguments P_i and P_j must both be vectors of p values or adjusted p values")
  Z=-qnorm(cbind(P_i,P_j)/2)
  
  if (!is.numeric(pi0) | !is.numeric(sigma) || (pi0>1) | (pi0<0) | (sigma<0)) stop("Parameters pi0 and sigma must be set; pi0 must be in [0,1] and sigma must be non-negative. ")
  
  gx=seq(0,xmax,res); gy=seq(0,ymax,res); gridx=matrix(gx,length(gx),length(gy)); gridy=t(matrix(gy,length(gy),length(gx))); 
  Zx=cbind(as.numeric(gridx),as.numeric(gridy))
  
  ww=which(Z[,1]^2 + Z[,2]^2 > 3.5^2)
  qn=matrix(0,length(gx),length(gy)) # qn[i,j] is set to the number of elements of Z with Z[,1]>gx[i] and Z[,2]>gx[j]
  e1=round(ecdf(gx)(Z[ww,1])*length(gx)); e2=round(ecdf(gy)(Z[ww,2])*length(gy)); nx=length(gx); ny=length(gy)
  for (ii in 1:length(ww)) {
    qn[1:e1[ii],1:e2[ii]]=qn[1:e1[ii],1:e2[ii]]+1 # block out lower-left rectangle for every point
  }
  pj=exp_quantile(2*pnorm(-Zx[,1]),2*pnorm(-Zx[,2]),rho=rho,pi0=pi0, sigma=sigma) # transformed p value
  qd=round(dim(Z)[1]*(1-ecdf(Z[,2])(Zx[,2]))) # denominator of observed quantile
  cf=pj*(1+qd)/(1+as.vector(qn)); cf[which(Zx[,1]^2 + Zx[,2]^2 <3.6^2)]=1; cf=matrix(cf,length(gx),length(gy))
  
  kk=4*((pi0*dmnorm(Zx,varcov=diag(2)))+ ((1-pi0)*dmnorm(Zx,varcov=rbind(c(1,rho),c(rho,sigma^2))))) # probability distribution of null SNPs
  
  ycut=round(ecdf(cf)(cutoffs)*length(cf)) # number of cf values less than each value of cutoffs
  null_L=cumsum(kk[order(cf)])[ycut]/sum(kk) # integrate pdf of null SNPs over region with cfdr less than cutoffs. Expected proportion of null SNPs lying in region with cfdr < cut.
  
  # xij is defined such that xij[i,j]=probability of null SNP falling in region x>gx[i],y>gy[j]
  xij=t(apply(apply(matrix(kk,length(gx),length(gy))[length(gx):1,length(gy):1], 2, cumsum), 1, cumsum))[length(gx):1,length(gy):1]
  maxm=  cummax(as.numeric(xij)[order(cf)])[ycut]/sum(kk) # maxm[j] is the maximum expected number of null SNPs in a region (x>xx,y>yy) with cfdr(xx,yy)<cutoffs[j]
  
  cutoffs*null_L/maxm # final FDR is bounded above by the ratio of expected number of SNPs in L divided by the expected number of SNPs in M
  
}







##' Fit a specific two Guassian mixture distribution to a set of Z values.
##'
##' Assumes 
##' Z ~ N(0,1) with probability  \code{\link{pi0}}, Z ~ N(0,1 + \code{\link{sigma}}^2) with probability 1-\code{\link{pi0}}
##'
##' We define 'true' Z scores as the Z scores that would be obtained if MAF for both groups exactly matched the corresponding MAFs in the population, or equivalently the expected values of Z scores. If 'true' Z scores are distributed following a 'spike and tail' model of 0 with probability \code{\link{pi0}} and N(0,\code{\link{sigma}}^2) with probability 1-\code{\link{pi0}}, then observed Z scores follow the above distribution.
##' @title fit.em
##' @param Z numeric vector of observed data
##' @param sigma_init initial value for \code{\link{sigma}}
##' @param pi0_init initial value for \code{\link{pi0}}
##' @param tol how small a change lhood prompts continued optimization
##' @param maxit maximum number of iterations
##' @return a list containing fitted \code{\link{pi0}}, fitted \code{\link{sigma}}, and a record of fitted parameters at eachs stage of the E-M algorithm.
##' @export
##' @author Chris Wallace, James Liley
##' @examples
##' sigma <- 2
##' pi0 <- 0.8
##' n <- 10000; n0=round(pi0*n); n1=n-n0
##' Z <- c(rnorm(n0,0,1),rnorm(n1,0,sqrt(1+ (sigma^2))))
##' fit<-fit.em(Z)
##' fit$pi0
##' fit$sigma
##' fit$history
fit.em <- function(Z,pi0_init=0.9,sigma_init=1,tol=1e-4,verbose=TRUE,maxit=1e4) {
  
  Z=Z[which(is.finite(Z))]
  
  ## probabilities of group0, group1 membership
  p <- c(pi0_init,1-pi0_init)
  px <- matrix(p,length(Z),2,byrow=TRUE)
  
  ## parameter vector
  pars <- c(pi0_init,sigma_init)
  

  pars.fail <- function(pars) if ((pars[1]<0) | (pars[1]>1) | (pars[2]<0)) return(TRUE) else return(FALSE)
  
  ## likelihood for first group
  lh1=function(pi) pi*dnorm(Z)
  
  ## likelihood for second group
  lh2=function(pi,sigma) pi*dnorm(Z,sd=sigma)
  
  ## likelihood function to be maximized
  lhood <- function(pars, sumlog=TRUE) {
   e=lh1(pars[1]) + lh2(1-pars[1],pars[2]) 
   e[which(e==0)] <- 1e-64
   e[which(is.infinite(e))] <- 1e64
   if(sumlog) return(-sum(log(e))) else return(-e)
  }
  
  nit <- 1
  df <- 1
  value <- matrix(NA,maxit,3,dimnames=list(NULL,c("pi0","sigma","lhood")))
  value[nit,] <- c(pars, lhood(pars))
  while(df>tol & nit<maxit) {
    
    nit <- nit+1
    
    ## E step
    px[,1]=lh1(pars[1]);
    px[,2]=lh2(1-pars[1],pars[2])

    px <- px/rowSums(px) ## normalise
    px[is.nan(px)] <- 0
    
    ## M step
    pars[1] = mean(px[,1])
    pars[2] <- sqrt( max(1,sum( px[,2] * Z^2 )) / sum(px[,2]) ) # enforce sigma >= 1
    value[nit,] <- c(pars, lhood(pars))
    df <- abs(value[nit,3] - value[nit-1,3])
    
  }
  return(list(pi0=pars[1], sigma=sqrt(pars[2]^2 - 1),
              history=value[1:nit,]))
}



# Draw plot
plotred=function(p_i,p_j,xmax=10,ymax=10,res=0.05,draw=TRUE,contour=TRUE,
                 points=TRUE,block=2,legend=TRUE,col=heat.colors(100),
                 xlp=0.7*xmax, ylp=0.6*ymax,...) {
  
  xl<<-seq(0,xmax,res); yl<<-seq(0,ymax,res)
  mm=matrix(0,length(xl),length(yl))
  
  for (i in 1:dim(mm)[2]) {
    ww=which(p_j < 10^(-yl[i]))
    if (length(ww)>0) {
      cfsub=(10^(-xl))/((1+ length(ww)*ecdf(p_i[ww])(10^(-xl)))/(1+length(ww)));
      mm[,i]= cfsub
    }else mm[,i]= 10^(-xl)
  }
  
  mm[which(mm>1)]=1
  cmat<<-mm
  
  if (draw) image(xl,yl,mm^(1/4),xaxs="i",yaxs="i",col=col,...)
  if (contour) {
    contour(xl,yl,mm,add=TRUE,col="darkgray",levels=10^(-((1:10)/3)))
    cut=(5e-8) / ((1+length(which(p_i<5e-8)))/(1+length(p_j)))
    contour(xl,yl,mm,add=TRUE,col="black",levels=cut)
  }
  if (points) {
    l_i=-log10(p_i); l_j=-log10(p_j)
    wxy=which(l_i+l_j > block)
    points(l_i[wxy],l_j[wxy],cex=0.5,pch=16)
    polygon(c(0,0,block),c(0,block,0),col="black")
  }
  if (legend) {
    rect(xlp,ylp,xlp+0.15*xmax,ylp+0.3*ymax,col="white")
    col_l=seq(0,1,length.out=length(col))^(1/4)
    image(xlp + c(0.0375,0.0525)*xmax,ylp+seq(0.03*ymax,0.23*ymax,length.out=length(col)),
          as.matrix(rbind(col_l,col_l)),add=TRUE); 
    rect(xlp+0.03*xmax,ylp+0.03*ymax,xlp+ 0.06*xmax,ylp + 0.23*ymax)
    for (i in seq(0,1,length.out=6)) text(xlp+0.1*xmax,ylp+ (0.03+ (0.2*i))*ymax,round(i,digits=1))
  }
}
jamesliley/cFDR-common-controls documentation built on May 18, 2019, 11:21 a.m.