## (stray non-code line from a web-scraped copy of this file - neutralized as a comment)
## Declare "pvalues" as a known symbol so R CMD check does not flag the
## data.table non-standard-evaluation use of `pvalues` below (see map_mask)
globalVariables("pvalues")
##' Bayesian colocalisation analysis, detailed output
##'
##' This function replicates coloc.abf, but outputs more detail for
##' further processing using coloc.process
##'
##' Intended to be called internally by coloc.signals
##'
##' @title Bayesian colocalisation analysis with detailed output
##' @inheritParams coloc.abf
##' @return a list of three \code{data.tables}s:
##' \itemize{
##' \item summary is a vector giving the number of SNPs analysed, and the posterior probabilities of H0 (no causal variant), H1 (causal variant for trait 1 only), H2 (causal variant for trait 2 only), H3 (two distinct causal variants) and H4 (one common causal variant)
##' \item df is an annotated version of the input data containing log Approximate Bayes Factors and intermediate calculations, and the posterior probability SNP.PP.H4 of the SNP being causal for the shared signal
##' \item df3 is the same for all 2 SNP H3 models
##' }
##' @author Chris Wallace
##' @seealso \code{\link{coloc.process}}, \code{\link{coloc.abf}}
coloc.detail <- function(dataset1, dataset2, MAF=NULL,
p1=1e-4, p2=1e-4, p12=1e-5) {
if(!is.list(dataset1) || !is.list(dataset2))
stop("dataset1 and dataset2 must be lists.")
## a single shared MAF vector may be supplied for both datasets
if(!("MAF" %in% names(dataset1)) & !is.null(MAF))
dataset1$MAF <- MAF
if(!("MAF" %in% names(dataset2)) & !is.null(MAF))
dataset2$MAF <- MAF
## per-trait per-SNP log Approximate Bayes Factors; drop SNPs where the
## ABF could not be computed
df1 <- as.data.table(process.dataset(d=dataset1, suffix="df1"))[!is.na(lABF.df1),]
df2 <- as.data.table(process.dataset(d=dataset2, suffix="df2"))[!is.na(lABF.df2),]
## join the two traits on snp id (and position, when both carry it)
if("position" %in% names(df1) && "position" %in% names(df2))
df <- merge(df1,df2,by=c("snp","position"))
else
df <- merge(df1,df2,by="snp")
if(!nrow(df))
stop("dataset1 and dataset2 should contain the same snps in the same order, or should contain snp names through which the common snps can be identified")
## H4 (one shared causal variant): per-SNP joint log ABF is the sum of
## the two traits' log ABFs at that SNP
df$lABF.h4 <- with(df, lABF.df1 + lABF.df2)
## add SNP.PP.H4 - post prob that each SNP is THE causal variant for a shared signal
my.denom.log.abf <- logsum(df$lABF.h4)
df$SNP.PP.H4 <- exp(df$lABF.h4 - my.denom.log.abf)
## H3 (two distinct causal variants): enumerate all ordered two-SNP
## models; log ABF = lABF1(snp1) + lABF2(snp2)
## NOTE(review): expand.grid includes snp1==snp2 pairs - presumably
## accounted for downstream in coloc.process; confirm
df3 <- as.data.table(expand.grid(snp1=df$snp,snp2=df$snp))
df3 <- merge(df3,df1[,.(snp,lABF.df1)],by.x="snp1",by.y="snp")
df3 <- merge(df3,df2[,.(snp,lABF.df2)],by.x="snp2",by.y="snp")
df3[,lABF.h3:=lABF.df1+lABF.df2]
setnames(df,c("lABF.df1","lABF.df2"),c("lABF.h1","lABF.h2"),skip_absent=TRUE)
##############################
## overall posterior probabilities for H0..H4 under priors p1, p2, p12
pp.abf <- combine.abf(df$lABF.h1, df$lABF.h2, p1, p2, p12)
common.snps <- nrow(df)
results <- c(nsnps=common.snps, pp.abf)
list(summary=results,
results=df,
results.H3=df3,
priors=c(p1=p1,p2=p2,p12=p12)
)
}
##' Internal helper function for finemap.signals
##'
##' @title find the next most significant SNP, masking a list of sigsnps
##' @param D dataset in standard coloc format
##' @param LD named matrix of r
##' @param r2thr mask all snps with r2 > r2thr with any in sigsnps
##' @param sigsnps names of snps to mask
##' @return named numeric - Z score named by snp
##' @author Chris Wallace
map_mask <- function(D,LD,r2thr=0.01,sigsnps=NULL) {
## make nicer data table
if("beta" %in% names(D) && "varbeta" %in% names(D)) {
x <- as.data.table(D[c("beta","varbeta","snp","MAF")])
x[,z:=beta/sqrt(varbeta)]
} else {
## no effect estimates available: derive |z| from two-sided p values
## (note this loses the sign of the association)
x <- as.data.table(D[c("pvalues","snp","MAF")])
x[,z:=qnorm(pvalues/2,lower.tail=FALSE)]
}
use <- rep(TRUE,nrow(x))
if(!is.null(sigsnps)) {
expectedz <- rep(0,nrow(x))
## mask every snp with |r| > sqrt(r2thr) (ie r2 > r2thr) to any sigsnp
friends <- apply(abs(LD[,sigsnps,drop=FALSE])>sqrt(r2thr),1,any,na.rm=TRUE)
use <- use & !friends
if(!any(use))
return(NULL)
imask <- match(sigsnps,x$snp)
## expected |z| at each snp if the only true signals were the sigsnps,
## propagated through LD
## NOTE(review): assumes rows of LD are aligned with rows of x - confirm
expectedz <- LD[,sigsnps,drop=FALSE] %*% abs(x$z[imask])
## excess association beyond what the masked signals would explain
zdiff <- abs(x$z[use]) - abs(expectedz[use])
} else {
zdiff <- abs(x$z)
}
## snp with the largest excess |z| among unmasked snps
wh <- which.max(zdiff)
## zthr = qnorm(pthr/2,lower.tail=FALSE)
## if(zdiff[wh] > zthr)
## return the observed (not excess) z at that snp, named by snp id
structure(x$z[use][wh],names=x$snp[use][wh])
## else
## NULL
}
##' Internal helper function for est_all_cond
##'
##' @title generate conditional summary stats
##' @param x coloc dataset
##' @param YY sum((Y-Ybar)^2)
##' @param sigsnps names of snps to jointly condition on
##' @param xtx optional, matrix X'X where X is the genotype matrix. If
##' not available, will be estimated from LD, MAF, beta and sample
##' size (the last three should be part of the coloc dataset)
##' @inheritParams map_cond
##' @return data.table giving snp, beta and varbeta on remaining snps
##' after conditioning
##' @author Chris Wallace
## Conditional estimates follow the summary-statistic conditional analysis of
## GCTA-COJO (Yang et al); N/s may be vectors (meta-analysis), hence sum(x$N).
est_cond <- function(x,LD,YY,sigsnps,xtx=NULL) {
## align LD to the dataset's snp order
LD <- LD[x$snp,x$snp]
nuse <- match(sigsnps,x$snp) # to be conditioned on
## to find new beta for, conditional on nuse, excluding SNPs in
## r2>0.8 with nuse because small inaccuracies can blow up, and we
## generally don't expect a second detectable signal in such high
## LD with a first signal
use <- !(rownames(LD) %in% sigsnps | apply(LD[nuse,,drop=FALSE]^2,2,max)>0.8)
check_dataset(x,req=c("beta","varbeta","MAF"))
## Estimating X'X is the key
if(!is.null(xtx))
XX <- xtx
else {
if(x$type=="quant") {
## if quant, assume MAF is correct estimate for sample
VX <- 2 * x$MAF * (1-x$MAF) # expected variance of X, h_buf in GCTA, Dj/N in Yang et al
XX <- sum(x$N) * LD * sqrt(matrix(VX,ncol=1) %*% matrix(VX,nrow=1)) # E(XtX/N)
} else {
## if case control, LD and MAF is from controls, need to adjust
VW <- VMAF.cc(x$MAF,x$beta,N0=sum((1-x$s)*x$N),N1=sum(x$s*x$N))
XX <- sum(x$N) * LD * sqrt(matrix(VW,ncol=1) %*% matrix(VW,nrow=1))
}
}
## partition diag(X'X) and X'X into conditioned-on (1) and remaining (2) snps
D <- diag(XX)
D1 <- D[nuse]
D2 <- D[use]
XX1=XX[nuse,nuse,drop=FALSE]
XX2=XX[use,use,drop=FALSE]
XX12=XX[nuse,use,drop=FALSE]
XX21=XX[use,nuse,drop=FALSE]
## accomodate nuse/use of length 1 or more
D1 <- if(length(nuse)==1) {
matrix(D1,nrow=1,ncol=1)
} else {
diag(D1)
}
D2 <- if(sum(use)==1) {
diag(D2,nrow=1,ncol=1)
} else {
diag(D2)
}
# joint effects at nuse, if needed
b1 <- solve(XX1) %*% D1 %*% matrix(x$beta[nuse],ncol=1)
## conditional effects 2 | 1
b2 <- x$beta[use] - XX21 %*% solve(XX1) %*% D1 %*% matrix(x$beta[nuse],ncol=1) / diag(D2)
## residual and conditional b2 variance
Sc <- c(YY -
matrix(b1,nrow=1) %*% D1 %*% matrix(x$beta[nuse],ncol=1)) # ok
Sc <- ( Sc - b2 * diag(D2) * x$beta[use] ) / (sum(x$N) - length(sigsnps) - 1)
vb2 <- c(Sc)*(diag(D2) -
diag(XX21 %*% solve(XX1) %*% XX12)) / diag(D2)^2
vb2 <- abs(c(vb2)) ## abs here because very occasionally can get a small negative vb2 due to approximation. In this case, replace by a positive vb2 of same small magnitude
## na <- apply(abs(LD[sigsnps,use,drop=FALSE])==1,2,any)
## return conditional estimates for retained snps; excluded snps get beta=0
## (treated as no residual association) with their original varbeta
rbind(data.table(snp=x$snp[use],beta=c(b2),varbeta=vb2),
data.table(snp=x$snp[!use],
beta=rep(0,sum(!use)), # no association
varbeta=x$varbeta[!use]))
## if(any(na))
## ret[na,c("beta","varbeta"):=list(NA,NA)]
}
## Variant of est_cond for a single (non-meta-analysed) study: N and s are
## used directly rather than summed over studies, and the high-LD exclusion
## threshold is r2>0.9 rather than 0.8. Otherwise the algebra is identical
## to est_cond - see the comments there.
est_cond.nometa <- function(x,LD,YY,sigsnps,xtx=NULL) {
## align LD to the dataset's snp order
LD <- LD[x$snp,x$snp]
nuse <- match(sigsnps,x$snp) # to be conditioned on
use <- !(rownames(LD) %in% sigsnps | apply(LD[nuse,,drop=FALSE]^2,2,max)>0.9) # to find new beta for, conditional on nuse, excluding SNPs in r2>0.9 with nuse because small inaccuracies can blow up, and we generally don't expect a second detectable signal in such high LD with a first signal
check_dataset(x,req=c("beta","varbeta","MAF"))
## Estimating X'X is the key
if(!is.null(xtx))
XX <- xtx
else {
if(x$type=="quant") {
## if quant, assume MAF is correct estimate for sample
VX <- 2 * x$MAF * (1-x$MAF) # expected variance of X, h_buf in GCTA, Dj/N in Yang et al
XX <- x$N * LD * sqrt(matrix(VX,ncol=1) %*% matrix(VX,nrow=1)) # ok
} else {
## if case control, LD and MAF is from controls, need to adjust
VW <- VMAF.cc(x$MAF,x$beta,N0=(1-x$s)*x$N,N1=x$s*x$N)
XX <- x$N * LD * sqrt(matrix(VW,ncol=1) %*% matrix(VW,nrow=1))
}
}
## partition diag(X'X) and X'X into conditioned-on (1) and remaining (2) snps
D <- diag(XX)
D1 <- D[nuse]
D2 <- D[use]
XX1=XX[nuse,nuse,drop=FALSE]
XX2=XX[use,use,drop=FALSE]
XX12=XX[nuse,use,drop=FALSE]
XX21=XX[use,nuse,drop=FALSE]
## accomodate nuse/use of length 1 or more
D1 <- if(length(nuse)==1) {
matrix(D1,nrow=1,ncol=1)
} else {
diag(D1)
}
D2 <- if(sum(use)==1) {
diag(D2,nrow=1,ncol=1)
} else {
diag(D2)
}
# joint effects at nuse, if needed
b1 <- solve(XX1) %*% D1 %*% matrix(x$beta[nuse],ncol=1)
## conditional effects 2 | 1
b2 <- x$beta[use] - XX21 %*% solve(XX1) %*% D1 %*% matrix(x$beta[nuse],ncol=1) / diag(D2)
## residual and conditional b2 variance
Sc <- c(YY -
matrix(b1,nrow=1) %*% D1 %*% matrix(x$beta[nuse],ncol=1)) # ok
Sc <- ( Sc - b2 * diag(D2) * x$beta[use] ) / (x$N - length(sigsnps) - 1)
vb2 <- c(Sc)*(diag(D2) -
diag(XX21 %*% solve(XX1) %*% XX12)) / diag(D2)^2
vb2 <- abs(c(vb2)) ## abs here because very occasionally can get a small negative vb2 due to approximation. In this case, replace by a positive vb2 of same small magnitude
## na <- apply(abs(LD[sigsnps,use,drop=FALSE])==1,2,any)
## conditional estimates for retained snps; excluded snps get beta=0 with
## their original varbeta
rbind(data.table(snp=x$snp[use],beta=c(b2),varbeta=vb2),
data.table(snp=x$snp[!use],
beta=rep(0,sum(!use)), # no association
varbeta=x$varbeta[!use]))
## if(any(na))
## ret[na,c("beta","varbeta"):=list(NA,NA)]
}
##' Internal helper function for finemap.signals
##'
##' @title find the next most significant SNP, conditioning on a list
##' of sigsnps
##' @param D dataset in standard coloc format
##' @param LD named matrix of r
##' @param YY sum(y^2)
##' @param sigsnps names of snps to mask
##' @return named numeric - Z score named by snp
##' @author Chris Wallace
map_cond <- function(D,LD,YY,sigsnps=NULL) {
  ## Nothing to condition on: report the SNP with the largest absolute
  ## marginal Z score, named by its snp id.
  if (is.null(sigsnps)) {
    zmarg <- D$beta / sqrt(D$varbeta)
    top <- which.max(abs(zmarg))
    return(structure(zmarg[top], names = D$snp[top]))
  }
  ## Otherwise compute summary statistics conditional on sigsnps and report
  ## the SNP with the largest absolute conditional Z score.
  cond <- est_cond(D, LD, YY, sigsnps)
  zcond <- cond$beta / sqrt(cond$varbeta)
  top <- which.max(abs(zcond))
  structure(zcond[top], names = cond$snp[top])
}
##' Internal helper function
##'
##' @title Pick out snp with most extreme Z score
##' @param D standard format coloc dataset
##' @return z at most significant snp, named by that snp id
##' @author Chris Wallace
find.best.signal <- function(D) {
  ## For a case-control dataset analysed by conditioning, approximate the
  ## logistic-regression effects by their linear-regression equivalents so Z
  ## scores are on the scale used by the conditional analysis.
  ## isTRUE() guards against D$method being absent: NULL=="cond" has length
  ## zero and a bare `if` on it would error.
  if(isTRUE(D$type=="cc") && isTRUE(D$method=="cond")) {
    message("approximating linear analysis of binary trait")
    D <- bin2lin(D)
  }
  ## (a YY = sum((Y-Ybar)^2) computation previously here was dead code: its
  ## value was never used by this function, so it has been removed)
  ## marginal Z score per SNP; return the most extreme, named by its snp id
  z <- D$beta/sqrt(D$varbeta)
  wh <- which.max(abs(z))
  structure(z[wh],names=D$snp[wh])
}
##' This is an analogue to finemap.abf, adapted to find multiple
##' signals where they exist, via conditioning or masking - ie a
##' stepwise procedure
##'
##' @title Finemap multiple signals in a single dataset
##' @param D list of summary stats for a single disease, see
##' \link{check_dataset}
##' @param LD matrix of signed r values (not rsq!) giving correlation between
##' SNPs
##' @param method if method="cond", then use conditioning to coloc multiple
##' signals. The default is mask - this is less powerful, but safer because it
##' does not assume that the LD matrix is properly allelically aligned to
##' estimated effect
##' @param r2thr if mask==TRUE, all snps will be masked with r2 > r2thr with any
##' sigsnps. Otherwise ignored
##' @param sigsnps SNPs already deemed significant, to condition on or mask,
##' expressed as a numeric vector, whose *names* are the snp names
##' @param pthr when p > pthr, stop successive searching
##' @param maxhits maximum depth of conditioning. procedure will stop if p >
##' pthr OR abs(z)<zthr OR maxhits hits have been found.
##' @param return.pp if FALSE (default), just return the hits. Otherwise return vectors of PP
##' @param mask use masking if TRUE, otherwise conditioning. defaults to TRUE
##' @export
##' @return list of successively significant fine mapped SNPs, named by the SNPs
##' @author Chris Wallace
finemap.signals <- function(D,LD=D$LD,
method=c("single","mask","cond"),
r2thr=0.01,
sigsnps=NULL,
pthr=1e-6,
maxhits=3,
return.pp=FALSE) {
method <- match.arg(method)
## conditioning needs effect estimates, N, MAF and a checked LD matrix
if(method=="cond") {
check_dataset(D,req=c("N","MAF","beta","varbeta"))
check_ld(D,LD)
} else {
check_dataset(D)
}
if(!is.null(sigsnps) && is.null(names(sigsnps)))
stop("sigsnps should be a named numeric vector, with snp ids as the names")
if(!all(names(sigsnps) %in% D$snp))
stop("not all sigsnps found in D$snp")
## sdY is needed to compute YY for quantitative traits; estimate if absent
if(D$type=="quant" & !("sdY" %in% names(D)))
D$sdY <- with(D, sdY.est(vbeta=varbeta, maf=MAF, n=N))
## z threshold equivalent to the two-sided p value threshold pthr
zthr=qnorm(pthr/2,lower.tail = FALSE)
## make nicer data table
## x <- as.data.table(D[c("beta","varbeta","snp","MAF")])
## x[,z:=beta/sqrt(varbeta)]
## align LD with the dataset's snp order; keep the unconverted dataset for
## the optional finemap.abf post-processing below
D$LD <- LD[D$snp,D$snp]
D.orig=D
if(D$type=="cc" & method=="cond") {
message("approximating linear analysis of binary trait")
D <- bin2lin(D)
}
## YY = sum((Y-Ybar)^2), needed by est_cond/map_cond when conditioning
YY=if(D$type=="quant") {
sum(D$N * D$sdY^2)
} else {
sum(D$N * D$s * (1 - D$s))
}
## check_ld in matching order
## stepwise search: keep adding the best masked/conditional hit until it
## fails the significance threshold or maxhits is reached
hits <- NULL
while(length(hits)<maxhits) {
newhit=if(method=="mask") {
map_mask(D,LD,r2thr,sigsnps=names(hits))
} else {
map_cond(D=D,LD=LD, YY=YY, sigsnps=names(hits))
}
if(is.null(newhit) || !length(newhit) || abs(newhit) < zthr )
break
hits <- c(hits,newhit)
if(method=="single") # stop after one
break
}
if(!return.pp)
return(hits)
## optionally run finemap.abf on each conditional dataset and merge the
## per-SNP posterior columns across conditioning levels
cond_results=est_all_cond(D.orig, hits, mode="cond")
xtra <- D[ intersect(names(D), c("N","sdY","type","s")) ]
finemap_results=lapply(cond_results, function(cond) {
finemap.abf(c(cond, xtra))
})
result=finemap_results[[1]]
if(length(finemap_results)>1) {
for(k in 2:length(finemap_results)) {
## NOTE(review): assumes finemap.abf output has columns named exactly
## "snp", "lABF." and "SNP.PP" - confirm against finemap.abf
finemap_results[[k]]=finemap_results[[k]][,c("snp","lABF.","SNP.PP"),drop=FALSE]
## NOTE(review): sep="" is not a paste0 argument; it is swallowed into
## `...` and pastes an empty string (harmless). paste0(x, k) was
## probably intended.
colnames(finemap_results[[k]])[c(2,3)]=paste0(colnames(finemap_results[[k]])[c(2,3)],k,sep="")
result=merge(result,finemap_results[[k]],by="snp",all=TRUE)
}
}
attr(result,"conditioned")=names(finemap_results)
result
}
## Select which previously-found hits to mask/condition on for analysis i.
## mode "allbutone": every hit except the i-th; otherwise ("iterative"):
## all hits found before the i-th (none when i==1). Empty-string
## placeholders are dropped (and names stripped) by setdiff.
gethits <- function(hits, i, mode) {
  if (mode == "allbutone") {
    masked <- hits[-i]
  } else {
    if (i == 1)
      return(character(0))
    masked <- hits[seq_len(i - 1)]
  }
  setdiff(masked, "")
}
##' Internal helper function
##'
##' @title Post process a coloc.details result using masking
##' @param obj object returned by coloc.detail()
##' @param hits1 lead snps for trait 1. If length > 1, will use
##' masking
##' @param hits2 lead snps for trait 2. If length > 1, will use
##' masking
##' @param LD named LD matrix (for masking)
##' @param r2thr r2 threshold at which to mask
##' @param LD1 named LD matrix (for masking) for trait 1 only
##' @param LD2 named LD matrix (for masking) for trait 2 only
##' @param mode either "iterative" (default) - successively condition
##' on signals or "allbutone" - find all putative signals and
##' condition on all but one of them in each analysis
##' @inheritParams coloc.abf
##' @return data.table of coloc results
##' @author Chris Wallace
coloc.process <- function(obj,hits1=NULL,hits2=NULL,LD=NULL,r2thr=0.01,p1=1e-4,p2=1e-4,p12=1e-6,LD1=LD,LD2=LD,
mode=c("iterative","allbutone")) {
mode <- match.arg(mode)
## which format?
## if("results" %in% names(result)) { # for legacy objects
## obj$results <- obj$results
## setnames(obj$results3,c("snpA","snpB"),c("snp1","snp2"))
## }
## rename lABF columns from coloc.detail to the shorter lbf1/lbf2/lbf3/lbf4
setnames(obj$results,paste0("lABF.h",c(1,2,4)),paste0("lbf",c(1,2,4)),skip_absent=TRUE)
setnames(obj$results.H3,paste0("lABF.h",3),paste0("lbf",3),skip_absent=TRUE)
## overall coloc on a given set of snp results
## .f computes PP.H0..PP.H4 from the (possibly masked) per-SNP log BFs,
## plus the best-supported snp under H1, H2 and H4
.f <- function(df,df3) {
lH0.abf <- 0
lH1.abf <- log(p1) + logsum(df$lbf1)
lH2.abf <- log(p2) + logsum(df$lbf2)
lH3.abf <- log(p1) + log(p2) + logsum(df3$lbf3)
lH4.abf <- log(p12) + logsum(df$lbf4)
all.abf <- c(lH0.abf, lH1.abf, lH2.abf, lH3.abf, lH4.abf)
best1 <- df[which.max(lbf1),]$snp
best2 <- df[which.max(lbf2),]$snp
best4 <- df[which.max(lbf4),]$snp
my.denom.log.abf <- logsum(all.abf)
pp.abf <- c(as.list(exp(all.abf - my.denom.log.abf)),list(best1=best1,best2=best2,best4=best4))
names(pp.abf) <- c(paste("PP.H", (1:(length(pp.abf)-3)) - 1, ".abf", sep = ""),
"best1","best2","best4")
as.data.table(cbind(nsnps=nrow(df),as.data.frame(pp.abf,stringsAsFactors=FALSE)))
}
## "" is used as a placeholder for "no hit" throughout
if(is.null(hits1))
hits1 <- ""
if(is.null(hits2))
hits2 <- ""
## one signal per trait
if(length(hits1)<=1 & length(hits2)<=1) {
tmp <- .f(obj$results,obj$results.H3)
tmp$hit1=hits1
tmp$hit2=hits2
## setnames(obj$results,"H4","SNP.PP.H4")
## obj$results$SNP.PP.H4 <- with(obj$results, exp(lbf4 - logsum(lbf4)))
## newresult[[ paste0("z.df1.row",r) ]] <- ifelse(df$snp %in% drop1, 0, df$z.df1)
## newresult[[ paste0("z.df2.row",r) ]] <- ifelse(df$snp %in% drop2, 0, df$z.df2)
## NOTE(review): priors are unnamed here but named in the multi-signal
## return below - confirm whether downstream code relies on the names
return(list(summary=tmp,
results=obj$results,
priors=c(p1,p2,p12)))
}
## otherwise mask
## - allbutone : all but one signal from each trait with >1 signal
## - iterative : successively one more signal from each trait with >1 signal
## r2 between every snp and each hit, per trait
ldfriends1 <- LD1[setdiff(unique(c(hits1)),""),,drop=FALSE]^2
ldfriends2 <- LD2[setdiff(unique(c(hits2)),""),,drop=FALSE]^2
## one coloc run per (hit1, hit2) pair
todo <- expand.grid(i=seq_along(hits1),j=seq_along(hits2))
todo <- t(todo) #[i<=j,])
newresult <- obj$results[,intersect(c("snp","position"),
colnames(obj$results)),
drop=FALSE,with=FALSE]
ret <- vector("list",ncol(todo))
for(r in 1:ncol(todo)) {
i <- todo[1,r]
j <- todo[2,r]
message(r)
## ldin <- apply(ldfriends[ setdiff(c(hits1[ i ],hits2[j]),""), ,drop=FALSE ],2,max)
drop1 <- drop2 <- NULL
## snps to mask for trait 1: anything in r2 > r2thr with the hits
## selected by gethits() for this row
if(length(hits1)>1) {
ihits1 <- gethits(hits1,i,mode)
if(!length(ihits1))
drop1 <- character(0)
else {
ldout1 <- apply(ldfriends1[ ihits1 , , drop=FALSE ],2,max)
drop1 <- colnames(ldfriends1)[ldout1 > r2thr]
}
}
if(length(hits2)>1) {
ihits2 <- gethits(hits2,j,mode)
if(!length(ihits2))
drop2 <- character(0)
else {
ldout2 <- apply(ldfriends2[ ihits2, , drop=FALSE ],2,max)
drop2 <- colnames(ldfriends2)[ldout2 > r2thr]
}
}
## ld1 <- apply(ldfriends[ setdiff(hits1[-i],""), , drop=FALSE ],2,max)
## ld2 <- apply(ldfriends[ setdiff(hits2[-i],""), , drop=FALSE ],2,max)
## drop1 <- colnames(ldfriends)[ld1 > r2thr]
## drop2 <- colnames(ldfriends)[ld2 > r2thr]
dropsnps <- unique(c(drop1,drop2))
message("dropping ",length(dropsnps),"/",nrow(obj$results)," : ",length(drop1)," + ",length(drop2))
## NOTE(review): this return exits the whole function mid-loop, discarding
## results from earlier rows - confirm this is intended rather than `next`
if(length(setdiff(obj$results$snp, dropsnps)) <= 1)
return(NULL)
## copy() so masking does not modify obj$results/obj$results.H3 in place
df <- copy(obj$results)
df3 <- copy(obj$results.H3)
## mask by overwriting log BFs with a large negative constant
df[snp %in% drop1, c("lbf1","lbf4"):=list(-1.1,-1.1)]
df[snp %in% drop2, c("lbf2","lbf4"):=list(-1.1,-1.1)]
## PP | H4
my.denom.log.abf <- logsum(df$lbf4)
newresult[[ paste0("SNP.PP.H4.row",r) ]] <- exp(df$lbf4 - my.denom.log.abf)
newresult[[ paste0("z.df1.row",r) ]] <- ifelse(df$snp %in% drop1, 0, df$z.df1)
newresult[[ paste0("z.df2.row",r) ]] <- ifelse(df$snp %in% drop2, 0, df$z.df2)
## df[snp %in% drop2, c("lbf2","lbf4"):=list(log(1/1000),log(1/1000))]
## df3[snp1 %in% drop1, lABF.df1:=0]
## df3[snp2 %in% drop2, lABF.df2:=0]
df3[snp1 %in% drop1 | snp2 %in% drop2,lbf3:=0]
## mask <- masks[[ todo[r,1] ]], mask2[[ todo[r,2] ]]),"")
## friends <- apply(abs(LD[masks[,,drop=FALSE]) > sqrt(r2thr),2,any)
## dropsnps <- rownames(LD)[which(friends)]
## cbind(.f(obj$results[ !(snp %in% dropsnps), ],
## obj$results3[ !(snp1 %in% dropsnps | snp2 %in% dropsnps), ]),
tmp <- cbind(.f(df, df3))
tmp$hit1=hits1[todo[1,r] ]
tmp$hit2=hits2[ todo[2,r] ]
ret[[r]] <- copy(tmp)
}
return(list(summary=rbindlist(ret),
results=newresult,
priors=c(p1=p1,p2=p2,p12=p12)))
}
##' Estimate single snp frequency distibutions
##'
##' @title estgeno1
##' @return relative frequency of genotypes 0, 1, 2
##' @author Chris Wallace
##' @param f MAF
##' @rdname estgeno1
##' @seealso estgeno2
estgeno.1.ctl <- function(f) {
  ## Hardy-Weinberg genotype frequencies for minor allele frequency f
  q <- 1 - f
  c(q^2, 2 * f * q, f^2)
}
## vectorized version
vestgeno.1.ctl <- function(f) {
  ## Row-wise Hardy-Weinberg genotype frequencies for a vector of MAFs:
  ## returns a length(f) x 3 matrix of frequencies for genotypes 0, 1, 2
  q <- 1 - f
  cbind(q^2, 2 * f * q, f^2)
}
##' @param G0 single snp frequency in controls (vector of length 3) -
##' obtained from estgeno.1.ctl
##' @param b log odds ratio
##' @rdname estgeno1
estgeno.1.cse <- function(G0,b) {
  ## Case genotype frequencies implied by control frequencies G0 and a
  ## per-allele log odds ratio b, normalised to sum to 1
  w <- c(1,
         exp(b - log(G0[1]/G0[2])),
         exp(2 * b - log(G0[1]/G0[3])))
  w / sum(w)
}
## vectorized version
vestgeno.1.cse <- function(G0,b) {
  ## Vectorised case genotype frequencies: G0 is an n x 3 matrix of control
  ## genotype frequencies (or counts), b a log odds ratio; returns an n x 3
  ## matrix of case genotype frequencies normalised row-wise.
  g0 <- 1
  g1 <- exp( b + log(G0[,2]) - log(G0[,1]))
  g2 <- exp( 2*b + log(G0[,3]) - log(G0[,1]))
  total <- g0 + g1 + g2
  cbind(g0, g1, g2) / total
}
## Expected genotype variance in a case-control sample, given control MAF f0,
## log OR b and case/control sample sizes. Combines expected control and case
## genotype counts, then returns the (Bessel-corrected) variance of genotype.
VMAF.cc <- function(f0,b,N0,N1) {
  geno.ctl <- vestgeno.1.ctl(f0) * N0   # expected genotype counts, controls
  geno.cse <- vestgeno.1.cse(geno.ctl, b) * N1 # expected counts, cases
  geno <- geno.ctl + geno.cse
  n <- N0 + N1
  m2 <- (geno %*% matrix(c(0, 1, 4), ncol = 1)) / n # E(x^2)
  m1 <- (geno %*% matrix(c(0, 1, 2), ncol = 1)) / n # E(x)
  (m2 - m1^2) * n / (n - 1)
}
## Single-study (scalar N, s) variant of bin2lin: approximate the beta and
## varbeta that a linear regression of a centred binary outcome on genotype
## would give, from log ORs and control MAF. See bin2lin for the
## meta-analysis version.
bin2lin.nometa <- function(D,doplot=FALSE) {
if(D$type!="cc")
stop("type != cc")
## est maf in cases
g0 <- vestgeno.1.ctl(D$MAF) # genotype freq in controls
g1 <- vestgeno.1.cse(g0,D$beta) # genotype freq in cases
ex0 <- tcrossprod(c(0,1,2),g0) # E(x) | control
ex1 <- tcrossprod(c(0,1,2),g1) # E(x) | case
ex <- (1 - D$s) * ex0 + D$s * ex1 # E(x)
vx0 <- tcrossprod(c(0,1,4),g0) # E(x^2) | control
vx1 <- tcrossprod(c(0,1,4),g1) # E(x^2) | case
vx <- (1 - D$s) * vx0 + D$s * vx1 - ex^2 # V(x) = E(x^2) | E(x)=0
vy <- D$s * (1-D$s) # V(y) = E(y^2) | E(y)=0 (centered)
## vxy <- D$s*ex1 - ex * D$s # E(xy)
vxy <- D$s * (1-D$s) * (ex1-ex0) # E(xy) | E(x) = E(y) = 0
D1 <- D
## D1$beta.bin <- D1$beta
## D1$varbeta.bin <- D1$varbeta
## linear-regression equivalents: beta = cov(x,y)/var(x),
## varbeta = (var(y)/var(x) - cov(x,y)^2/var(x)^2)/(N-1)
D1$beta <- c(vxy/vx)
D1$varbeta <- c(vy/vx - vxy^2/vx^2)/(D$N-1)
##check
## sanity check: regress approximated z on original z through the origin;
## slope should be ~1 if the approximation is good
z1 <- D1$beta/sqrt(D1$varbeta)
z <- D$beta/sqrt(D$varbeta)
mod <- lm( z1 ~ z - 1 )
message("quality of linear approximation (ideal is 1): ",round(coef(mod),4))
if(coef(mod) > 1.1 | coef(mod) < 0.9) {
warning("linear approx quality is not good")
}
if(doplot==TRUE) {
plot(D$beta/sqrt(D$varbeta),D1$beta/sqrt(D1$varbeta),xlab="Logistic z",ylab="Approximated linear z"); abline(0,1); abline(0,coef(mod),lty=2,col="red")
legend("topleft",lty=2,col="red",legend=round(coef(mod),4))
}
D1$quality <- coef(mod)
D1
}
## Column-wise weighted mean of x with weights w. When w is a matrix the
## weights vary per element (column sums of w are used); otherwise a single
## weight vector is applied to every column.
wmean <- function(x,w) {
  total <- if (is.matrix(w)) colSums(w) else sum(w)
  colSums(x * w) / total
}
##' Convert binomial to linear regression
##'
##' Estimate beta and varbeta if a linear regression had been run on a
##' binary outcome, given log OR and their variance + MAF in controls
##'
##' sets beta = cov(x,y)/var(x) varbeta = (var(y)/var(x) -
##' cov(x,y)^2/var(x)^2)/N
##' @title binomial to linear regression conversion
##' @param D standard format coloc dataset
##' @param doplot plot results if TRUE - useful for debugging
##' @return D, with original beta and varbeta in beta.bin,
##' varbeta.bin, and beta and varbeta updated to linear estimates
##' @author Chris Wallace
bin2lin <- function(D,doplot=FALSE) {
if(D$type!="cc")
stop("type != cc")
## est maf in cases
g0 <- vestgeno.1.ctl(D$MAF) # genotype freq in controls
g1 <- vestgeno.1.cse(g0,D$beta) # genotype freq in cases
ex0 <- tcrossprod(c(0,1,2),g0) # E(x) | control
ex1 <- tcrossprod(c(0,1,2),g1) # E(x) | case
## per-stratum moments (s and N may be vectors over studies)
ex <- crossprod(t((1 - D$s)),ex0) + crossprod(t(D$s), ex1) # E(x) | strata
vx0 <- tcrossprod(c(0,1,4),g0) # E(x^2) | control
vx1 <- tcrossprod(c(0,1,4),g1) # E(x^2) | case
vx <- crossprod(t(1 - D$s),vx0) + crossprod(t(D$s), vx1) - ex^2
vy <- D$s * (1-D$s) # V(y) = E(y^2) | E(y)=0 (centered) | strata
## vxy <- D$s*ex1 - ex * D$s # E(xy)
vxy <- crossprod(t(vy),(ex1-ex0)) # E(xy) | E(x) = E(y) = 0 | strata
D1 <- D
## D1$beta.bin <- D1$beta
## D1$varbeta.bin <- D1$varbeta
## per-stratum linear beta = cov(x,y)/var(x); combine strata by
## inverse-variance weighting
varbeta <- (vy/vx - vxy^2/vx^2) / (D$N-1)
D1$varbeta <- 1/colSums(1/varbeta)
D1$beta <- wmean(vxy/vx, 1/varbeta)
##check
## sanity check: regress approximated z on original z through the origin;
## slope should be ~1 if the approximation is good
z1 <- D1$beta/sqrt(D1$varbeta)
z <- D$beta/sqrt(D$varbeta)
mod <- lm( z1 ~ z - 1 )
message("quality of linear approximation (ideal is 1): ",round(coef(mod),4))
if(coef(mod) > 1.1 | coef(mod) < 0.9) {
warning("linear approx quality is not good")
}
if(doplot==TRUE) {
plot(D$beta/sqrt(D$varbeta),D1$beta/sqrt(D1$varbeta),xlab="Logistic z",ylab="Approximated linear z"); abline(0,1); abline(0,coef(mod),lty=2,col="red")
legend("topleft",lty=2,col="red",legend=round(coef(mod),4))
}
D1$quality <- coef(mod)
D1
}
## Build the list of conditional datasets for a set of fine-mapped hits FM.
## mode "allbutone": one dataset per hit, conditioning on all other hits;
## otherwise ("cond"/iterative): datasets conditioning on successively more
## hits (none, first, first two, ...). Returns a named list of data.tables.
est_all_cond <- function(D,FM,mode) {
if(D$type=="cc")
D <- bin2lin(D)
D$z <- D$beta/sqrt(D$varbeta)
## only one hit: nothing to condition on, return the marginal stats
if(length(FM)==1) {
cond <- as.data.table(D[intersect(c("beta","varbeta","snp","MAF","position","z"),names(D))])
return(structure(list(cond), names=names(FM)))
}
## YY = sum((Y-Ybar)^2), needed by est_cond
YY=if(D$type=="quant") {
sum(D$N * D$sdY^2)
} else {
sum(D$N * D$s * ( 1 - D$s ))
}
## sigs: list of snp-name vectors to condition on, one per output dataset
if(mode=="allbutone") {
sigs=lapply(seq_along(FM), function(i) {
names(FM)[-i]
})
} else {
sigs <- lapply(1:(length(FM)+1), function(i) {
if(i==1)
NULL
else
names(FM)[1:(i-1)]
})
}
cond <- lapply(sigs, function(sigsnps) {
if(is.null(sigsnps))
as.data.table(D[intersect(c("beta","varbeta","snp","MAF","position","z"),names(D))])
else
est_cond(D,D$LD,YY=YY, sigsnps=sigsnps)
})
## name each dataset by the hit (allbutone) or the "+"-joined conditioning set
names(cond) <- if(mode=="allbutone") {
names(FM)
} else {
sapply(sigs,paste,collapse="+")
}
cond
}
## merge_cond <- function(cond,D) {
## m <- match(cond$snp,D$snp)
## for(nm in intersect(names(D), c("MAF","position")))
##' New coloc function, builds on coloc.abf() by allowing for multiple
##' independent causal variants per trait through conditioning or
##' masking.
##'
##' @title Coloc with multiple signals per trait
##' @inheritParams coloc.abf
##' @param LD required if method="cond". matrix of genotype
##' *correlation* (ie r, not r^2) between SNPs. If dataset1 and
##' dataset2 may have different LD, you can instead add LD=LD1 to
##' the list of dataset1 and a different LD matrix for dataset2
##' @param method default "" means do no conditioning, should return
##' similar to coloc.abf. if method="cond", then use conditioning
##' to coloc multiple signals. if method="mask", use masking to
##' coloc multiple signals. if different datasets need different
##' methods (eg LD is only available for one of them) you can set
##' method on a per-dataset basis by adding method="..." to the
##' list for that dataset.
##' @param mode "iterative" or "allbutone". Easiest understood with
##' an example. Suppose there are 3 signal SNPs detected for
##' trait 1, A, B, C and only one for trait 2, D.
##'
##' Under "iterative" mode, 3 coloc will be performed:
##' * trait 1 - trait 2
##' * trait 1 conditioned on A - trait 2
##' * trait 1 conditioned on A+B - trait 2
##'
##' Under "allbutone" mode, they would be
##' * trait 1 conditioned on B+C - trait 2
##' * trait 1 conditioned on A+C - trait 2
##' * trait 1 conditioned on A+B - trait 2
##'
##' Only iterative mode is supported for method="mask".
##'
##' The allbutone mode is optimal if the signals are known with
##' certainty (which they never are), because it allows each
##' signal to be tested without influence of the others. When
##' there is uncertainty, it may make sense to use iterative mode,
##' because the strongest signals aren't affected by conditioning
##' incorrectly on weaker secondary and less certain signals.
##' @param maxhits maximum number of levels to condition/mask
##' @param r2thr if masking, the threshold on r2 should be used to
##' call two signals independent. our experience is that this
##' needs to be set low to avoid double calling the same strong
##' signal.
##' @param pthr if masking or conditioning, what p value threshold to
##' call a secondary hit "significant"
##' @export
##' @return data.table of coloc results, one row per pair of lead snps
##' detected in each dataset
##' @author Chris Wallace
coloc.signals <- function(dataset1, dataset2,
                          MAF=NULL, LD=NULL,
                          method=c("single","cond","mask"),
                          mode=c("iterative","allbutone"),
                          p1=1e-4, p2=1e-4, p12=NULL, maxhits=3, r2thr=0.01,
                          pthr = 1e-06) {
  method <- match.arg(method)
  mode <- match.arg(mode)
  if(is.null(p12))
    stop("default value for p12 has been removed. please read ... and choose a value appropriate for your study")
  ## error checking: shared MAF/LD/method arguments are copied into each
  ## dataset unless that dataset already carries its own
  if(!("MAF" %in% names(dataset1)) && !is.null(MAF))
    dataset1$MAF <- MAF
  if(!("MAF" %in% names(dataset2)) && !is.null(MAF))
    dataset2$MAF <- MAF
  if(!("LD" %in% names(dataset1)) && !is.null(LD))
    dataset1$LD <- LD
  if(!("LD" %in% names(dataset2)) && !is.null(LD))
    dataset2$LD <- LD
  if(!("method" %in% names(dataset1)) && !is.null(method))
    dataset1$method <- method
  if(!("method" %in% names(dataset2)) && !is.null(method))
    dataset2$method <- method
  ## conditioning needs effect estimates + a checked LD matrix; estimate sdY
  ## for quantitative traits when absent
  if(dataset1$method=="cond") {
    check_dataset(dataset1,1,req=c("beta","varbeta","MAF"))
    check_ld(dataset1,dataset1$LD)
    if(dataset1$type=="quant" && !("sdY" %in% names(dataset1)))
      dataset1$sdY <- with(dataset1, sdY.est(vbeta=varbeta, maf=MAF, n=N))
  } else {
    check_dataset(dataset1,1)
  }
  if(dataset2$method=="cond") {
    check_dataset(dataset2,2,req=c("beta","varbeta","MAF"))
    check_ld(dataset2,dataset2$LD)
    if(dataset2$type=="quant" && !("sdY" %in% names(dataset2)))
      dataset2$sdY <- with(dataset2, sdY.est(vbeta=varbeta, maf=MAF, n=N))
  } else {
    check_dataset(dataset2,2)
  }
  ## (an unused local zthr = qnorm(pthr/2, lower.tail=FALSE) was removed;
  ## finemap.signals computes its own threshold from pthr)
  ## fine map each trait; fall back to the single best signal when nothing
  ## passes pthr, so there is always at least one hit per trait
  fm1 <- finemap.signals(dataset1,
                         method=dataset1$method,
                         maxhits=maxhits,r2thr=r2thr,pthr=pthr)
  fm2 <- finemap.signals(dataset2,
                         method=dataset2$method,
                         maxhits=maxhits,r2thr=r2thr,pthr=pthr)
  if(is.null(fm1))
    fm1 <- find.best.signal(dataset1)
  ## warning("no signal with p <",pthr," in dataset1")
  if(is.null(fm2))
    fm2 <- find.best.signal(dataset2)
  ## warning("no signal with p <",pthr," in dataset2")
  ## conditionals if needed: one conditional dataset per conditioning level,
  ## plus the dataset attributes needed to rebuild a coloc dataset
  if(!is.null(fm1) && dataset1$method=="cond") {
    cond1 <- est_all_cond(dataset1,fm1,mode=mode)
    X1 <- dataset1[ intersect(names(dataset1), c("N","sdY","type","s")) ]
  }
  if(!is.null(fm2) && dataset2$method=="cond") {
    cond2 <- est_all_cond(dataset2,fm2,mode=mode)
    X2 <- dataset2[ intersect(names(dataset2), c("N","sdY","type","s")) ]
  }
  ## exactly one of the four branches below runs, setting `res`
  ## double mask/-
  if(dataset1$method %in% c("single","mask") && dataset2$method %in% c("single","mask")) {
    col <- coloc.detail(dataset1,dataset2, p1=p1,p2=p2,p12=p12)
    res <- coloc.process(col,
                         hits1=names(fm1), hits2=names(fm2),
                         LD1=dataset1$LD, LD2=dataset2$LD, r2thr=r2thr,
                         mode=mode,p12=p12)
  }
  ## double cond: one coloc per pair of conditioning levels
  ## NOTE(review): these coloc.process calls forward p12 but not p1/p2 (or
  ## mode/r2thr), so coloc.process uses its own defaults for those - confirm
  ## whether that is intended
  if(dataset1$method=="cond" && dataset2$method=="cond") {
    todo <- expand.grid(i=seq_along(cond1),j=seq_along(cond2))
    res <- vector("list",nrow(todo))
    for(k in 1:nrow(todo)) {
      col <- coloc.detail(dataset1=c(cond1[[ todo[k,"i"] ]],X1),
                          dataset2=c(cond2[[ todo[k,"j"] ]],X2),
                          p1=p1,p2=p2,p12=p12)
      res[[k]] <- coloc.process(col,
                                hits1=names(fm1)[ todo[k,"i"] ],
                                hits2=names(fm2)[ todo[k,"j"] ],
                                LD1=dataset1$LD, LD2=dataset2$LD,p12=p12)
    }
  }
  ## cond mask/-
  if(dataset1$method=="cond" && dataset2$method %in% c("single","mask")) {
    res <- vector("list",length(cond1))
    for(k in 1:length(res)) {
      col <- coloc.detail(c(cond1[[ k ]],X1),
                          dataset2,
                          p1=p1,p2=p2,p12=p12)
      res[[k]] <- coloc.process(col,
                                hits1=names(fm1)[ k ],
                                hits2=names(fm2),
                                LD1=dataset1$LD, LD2=dataset2$LD,r2thr=r2thr,p12=p12)
    }
  }
  ## mask/- cond
  if(dataset1$method %in% c("single","mask") && dataset2$method=="cond") {
    res <- vector("list",length(cond2))
    for(k in 1:length(res)) {
      col <- coloc.detail(dataset1,
                          c(cond2[[ k ]],X2),
                          p1=p1,p2=p2,p12=p12)
      res[[k]] <- coloc.process(col,
                                hits1=names(fm1),
                                hits2=names(fm2)[ k ],
                                LD1=dataset1$LD, LD2=dataset2$LD,r2thr=r2thr,p12=p12)
    }
  }
  ## post-process: when conditioning was used, res is a list of per-level
  ## results; bind summaries and merge per-SNP columns row by row
  if(dataset1$method=="cond" || dataset2$method=="cond") {
    summary <- rbindlist(lapply(res,"[[","summary"))
    rowvars <- c("SNP.PP.H4","z.df1","z.df2") # replicated per row
    results <- res[[1]]$results
    setnames(results,rowvars,paste0(rowvars,".row1"),skip_absent=TRUE)
    for(i in setdiff(1:length(res),1)) {
      thisone <- res[[i]]$results[,grep("snp|SNP.PP.H4|z.df1|z.df2",
                                        names(res[[i]]$results)),
                                  drop=FALSE,with=FALSE]
      setnames(thisone,rowvars,paste0(rowvars,".row",i),skip_absent=TRUE)
      results <- merge(results,thisone,by="snp",all=TRUE)
    }
  } else {
    summary <- res[["summary"]]
    results <- res[["results"]]
  }
  ## attach the marginal z score of each lead snp to the summary table
  d1 <- data.table(hit1=names(fm1),hit1.margz=c(fm1))
  res <- merge(summary,d1,by="hit1")
  d2 <- data.table(hit2=names(fm2),hit2.margz=c(fm2))
  res <- merge(res,d2,by="hit2")
  ret <- list(summary=res,
              results=results,
              priors=c(p1=p1,p2=p2,p12=p12)) # fixed: previously p1=p2 (typo)
  class(ret) <- c("coloc_abf",class(ret))
  ret
}
## The following lines are boilerplate text from the documentation-hosting
## site this file was copied from, not part of the package source; they are
## commented out so the file parses:
## Any scripts or data that you put into this service are public.
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.