get_pi <- function(alpha, e){
alpha0 <- alpha[1]
alpha1 <- alpha[2]
if (alpha0>10 | alpha1>10){
return(p=1)
}
else{
p=exp(alpha0 + alpha1 * e)/(1 + exp(alpha0 + alpha1 * e))
}
}
q_pos <- function(b, zpost, rate.s0, mut, mutidx,ll.n){
ll.s <- get_ll_s(b,mut,mutidx, rate.s0)
q <- sum(zpost[ ,1] * ll.s + zpost[ ,2] * ll.n)
return(q)
}
dd_loglik <- function(p, rate.s0, ll.n, mutidx, mut, e){
beta0 <- p[1]
alpha <- p[2:3]
pi <- get_pi(alpha, e)
ll.s <- get_ll_s(beta0, mut,mutidx, rate.s0)
ll <- sum(log(pi * exp(ll.s) + (1-pi) * exp(ll.n)))
ll
}
get_ll_s <- function(b, mut, mutidx, rate_s0){
rate_s <- rate_s0 * exp(b)
rmtx <- -rate_s
rmtx[mutidx] <- mut[mutidx]*log(rate_s[mutidx]) - rate_s[mutidx]
# log likelihood for each sample under selection
colSums(rmtx)
}
dd_EM_update <- function(p, rate.n, rate.s0, ll.n, mutidx,type = c("null", "alt"),mut,e){
# p: beta0, alpha
beta0 <- p[1]
alpha <- p[2:3]
# update z_i
ll.s <- get_ll_s(b=beta0,mut=mut,mutidx=mutidx, rate_s0=rate.s0)
pi <- get_pi(alpha=alpha, e=e)
zpost <- cbind(pi * exp(ll.s) , (1-pi) * exp(ll.n)) # 1st column selection, 2nd column neutral
zpost <- zpost/rowSums(zpost)
# update beta0
suppressWarnings(res <- optim(1, q_pos, zpost = zpost, rate.s0 = rate.s0, mut=mut,ll.n = ll.n, mutidx=mutidx, control=list(fnscale=-1))) # init with beta0 = 1.
beta0 <- res$par
# update alpha
if (type == "null"){
lg.x <- rep(1, ncol(mut))
}
if (type == "alt"){
lg.x <- e
}
suppressWarnings(alpha <- brglm::brglm(zpost~ lg.x,family="binomial")$coefficients)
if (type == "null"){
alpha <- c(alpha[1], 0)
}
pnew <- c(beta0, alpha)
return(pnew)
}
dd_EM_ordinary <- function(beta0 = 0, alpha = c(0,0), rate.n, rate.s0, ll.n, mutidx, type = c("null", "alt"), maxit = 100, tol = 1e-3,mut,e){
ll_rec <- rep(0, maxit)
p_rec <- NULL
# initialize
p <- c(beta0, alpha)
for (i in 1:maxit){
ll <- dd_loglik(p, rate.s0, ll.n, mutidx,e)
ll_rec[i] <- ll
cat("iteration ", i,"; loglikelihood:", ll, "\n")
pnew <- dd_EM_update(p, rate.n, rate.s0, ll.n, mutidx, type = type,mut=mut,e=e)
p_rec <- rbind(p_rec, pnew)
if (dist(rbind(pnew, p)) < tol){
break
}
p <- pnew
}
return(list("loglikelihood" = ll, "beta0" = p[1], "alpha" = p[2:3], "loglik_rec" = ll_rec, "param_rec" = p_rec))
}
dd_squarEM <- function(beta0 = 0, alpha = c(0,0), rate.n, rate.s0, ll.n,mutidx, type = c("null", "alt"), mut,e, maxit = 100, tol = 1e-3){
p <- c(beta0, alpha)
# EM
res <- SQUAREM::squarem(p=p, rate.n = rate.n, rate.s0=rate.s0, ll.n=ll.n, mutidx=mutidx, type = type,mut=mut,e=e, fixptfn=dd_EM_update, control=list(tol=tol, maxiter = maxit))
p <- res$par
ll <- dd_loglik(p=p, rate.s0=rate.s0, ll.n=ll.n, mutidx=mutidx,mut=mut,e=e)
return(list("loglikelihood" = ll, "beta0" = p[1], "alpha" = p[2:3]))
}
#' @title diffDriver model
#' @description This model is applied on data of a single gene. It will infer effect size for both sample-level variable and positional level functional annotations. We used an EM algorithm to infer parameters.
#' @param mut a matrix of mutation status 0 or 1, rows positions, columns are samples.
#' @param e a vector,phenotype of each sample,
#' should match the columns of \code{mut} and \code{mr}
#' @param mr a matrix, mutation rate of each sample at each mutation (log scale) that is not dependent on sample level factor
#' @param fe, a vector, increased mutation rate at each position, depending on e (log scale),
#' should match the rows of \code{mut} and \code{mr}
#' @export
ddmodel <- function(mut, e, mr, fe, label, ...){
rate.n <- exp(mr)
rate.s0 <- exp(fe) * rate.n
## aggregate the duplicate rows
rate.n <- aggregate(rate.n,by=list(label),sum,na.rm=T)[,-c(1)]
rate.s0 <- aggregate(rate.s0,by=list(label),sum,na.rm=T)[,-c(1)]
mut <- aggregate(as.matrix(mut),by=list(label),sum,na.rm=T)[,-c(1)]
mutidx <- which(mut!=0, arr.ind = T)
## Parameter estimation based on Poisson likelihood
ll.n <- colSums(mut*log(rate.n)-rate.n)
# initialize
# bi=(nrow(mutidx) - sum(rate.n)/sum(rate.s0)
#beta0.init <- ifelse( bi>0, log(bi),rnorm(1))
beta0.init <- 0
res.null <- dd_squarEM(beta0 = beta0.init, rate.n = rate.n, rate.s0 = rate.s0, ll.n=ll.n,mutidx=mutidx, type = "null",mut=mut,e=e, ...)
res.alt <- dd_squarEM(beta0 = beta0.init, rate.n = rate.n, rate.s0 = rate.s0, ll.n=ll.n,mutidx=mutidx, type = "alt",mut=mut,e=e, ...)
## Get p value
teststat<- -2*(res.null$loglikelihood - res.alt$loglikelihood)
pvalue <- pchisq(teststat, df=1, lower.tail=FALSE)
res <- list("pvalue"=pvalue, "res.null" = res.null, "res.alt"=res.alt)
return(res)
}
#' @title diffDriver model only for binary phenotype.
#' @description This function uses the model as cmodel.frac,
#' but generalizes to take more than 1 functional categories.
#' This model is applied on data of a single gene
#' @param mut a matrix of mutation status 0 or 1
#' @param e a vector,phenotype of each sample,
#' should match the columns of \code{mut} and \code{bmr}
#' @param bmr a matrix, background mutation rate of each sample at each mutation (log scale)
#' @param fe, a vector, increased mutation rate at each mutation, due to functional effect (log scale),
#' should match the rows of \code{mut} and \code{bmr}
#' @export
ddmodel_binary <- function(mut, e, bmr, fe){
fe <- exp(fe)
bmr <- exp(bmr)
mut.t <- t(mut)
mut <- as.matrix(mut)
mut.t <- as.matrix(mut.t)
lln <- function(eta){
# log likelihood under null
b.pi <- exp(eta) * (fe - 1) * bmr + bmr
b.pi[b.pi <= 0] <- 1e-8
b.pi[b.pi >= 1] <- 1 - 1e-8
llmtx <- (1- mut) * log(1 - b.pi) + mut * log(b.pi)
ll <- sum(llmtx,na.rm=T)
return(ll)
}
lla <- function(eta){
# log likelihood under alt
eta0 <- eta[1]
eta1 <- eta[2]
b.pi0 <- t(exp(eta0) * (fe - 1) * bmr + bmr)
b.pi1 <- t(exp(eta1) * (fe - 1) * bmr + bmr)
b.pi0[b.pi0 <= 0] <- 1e-8
b.pi1[b.pi1 <= 0] <- 1e-8
b.pi0[b.pi0 >= 1] <- 1 - 1e-8
b.pi1[b.pi1 >= 1] <- 1 - 1e-8
b.pi <- b.pi0 + e * (b.pi1- b.pi0)
llmtx <- (1- mut.t) * log(1- b.pi) + mut.t * log(b.pi)
ll <- sum(llmtx,na.rm=T)
return(ll)
}
suppressWarnings(resn <- optim(0, lln, method="Nelder-Mead", control=list(fnscale=-1))) # BFGS has Error: non-finite finite-difference value [2]
suppressWarnings(resa <- optim(c(0,0), lla, method="Nelder-Mead", control=list(fnscale=-1)))
probnull <- mean(exp(resn$par)*(fe-1)*bmr+bmr)
probalt1 <- mean(exp(resa$par[1])*(fe-1)*bmr+bmr)
probalt2 <- mean(exp(resa$par[2])*(fe-1)*bmr+bmr)
teststat<- -2*(resn$value-resa$value)
pvalue <- pchisq(teststat,df=1,lower.tail=FALSE)
all<- c(pvalue,resn$par,resa$par,resn$value,resa$value)
names(all)=c("pvalue","null_para","e=0","e=1","null_likelihood","alt_likelihood")
res <- list("pvalue"=pvalue, "null.eta0" = resn$par, "alt.eta"=resa$par, "null.ll"= resn$value, "alt.ll" = resa$value,"all"=all)
return(res)
}
#' @title diffDriver model only for binary phenotype, assuming bmr the same across samples.
#' @description This function uses the model as cmodel.frac,
#' but generalizes to take more than 1 functional categories.
#' This model is applied on data of a single gene. This should give the same results as ddmodel_binary defined above.
#' @param mut a matrix of mutation status 0 or 1
#' @param e a vector,phenotype of each sample,
#' should match the columns of \code{mut} and \code{bmr}
#' @param bmr a matrix, background mutation rate of each sample at each mutation (log scale), as we assume bmr the same across samples, only the first column will be used.
#' @param fe, a vector, increased mutation rate at each mutation, due to functional effect (log scale),
#' should match the rows of \code{mut} and \code{bmr}
#' @export
ddmodel_binary_simple <- function(mut, e, bmr, fe){
fe <- exp(fe)
bmr <- exp(bmr)
mut <- as.matrix(mut)
mutpos <- rowSums(mut)
npos <- ncol(mut) - mutpos
mutpos.e0 <- rowSums(mut[, e==0])
npos.e0 <- ncol(mut[, e==0]) - mutpos.e0
mutpos.e1 <- rowSums(mut[, e==1])
npos.e1 <- ncol(mut[, e==1]) - mutpos.e1
index0=min(which(e==0))
index1=min(which(e==1))
lln <- function(eta){
# log likelihood under null
b.pi0 <- exp(eta) * (fe - 1) * bmr[,index0] + bmr[,index0]
b.pi1 <- exp(eta) * (fe - 1) * bmr[,index1] + bmr[,index1]
b.pi0[b.pi0 <= 0] <- 1e-8
b.pi0[b.pi0 >= 1] <- 1 - 1e-8
b.pi1[b.pi1 <= 0] <- 1e-8
b.pi1[b.pi1 >= 1] <- 1 - 1e-8
llmtx0 <- npos.e0 * log(1- b.pi0) + mutpos.e0 * log(b.pi0)
llmtx1 <- npos.e1 * log(1- b.pi1) + mutpos.e1 * log(b.pi1)
ll <- sum(llmtx0,na.rm=T)+sum(llmtx1,na.rm=T)
return(ll)
}
lla <- function(eta){
# log likelihood under alt
eta0 <- eta[1]
eta1 <- eta[2]
index0=min(which(e==0))
index1=min(which(e==1))
b.pi0 <- t(exp(eta0) * (fe - 1) * bmr[,index0] + bmr[,index0])
b.pi1 <- t(exp(eta1) * (fe - 1) * bmr[,index1] + bmr[,index1])
b.pi0[b.pi0 <= 0] <- 1e-8
b.pi1[b.pi1 <= 0] <- 1e-8
b.pi0[b.pi0 >= 1] <- 1 - 1e-8
b.pi1[b.pi1 >= 1] <- 1 - 1e-8
llmtx0 <- npos.e0 * log(1- b.pi0) + mutpos.e0 * log(b.pi0)
llmtx1 <- npos.e1 * log(1- b.pi1) + mutpos.e1 * log(b.pi1)
ll <- sum(llmtx0) + sum(llmtx1)
return(ll)
}
suppressWarnings(resn <- optim(0, lln, method="Nelder-Mead", control=list(fnscale=-1))) # BFGS has Error: non-finite finite-difference value [2]
suppressWarnings(resa <- optim(c(0,0), lla, method="Nelder-Mead", control=list(fnscale=-1)))
teststat<- -2*(resn$value-resa$value)
pvalue <- pchisq(teststat,df=1,lower.tail=FALSE)
res <- list("pvalue"=pvalue, "null.eta0" = resn$par, "alt.eta"=resa$par, "null.ll"= resn$value, "alt.ll" = resa$value)
return(res)
}
# note this version does not estimate beta. It doesn't use EM, but use optim to estimate alpha.
ddmodel_conti_simple <- function(mut, e, bmr, fe){
fe <- exp(fe)
bmr <- exp(bmr)
mut <- as.matrix(mut)
mut1=data.frame()
mut0=data.frame()
aa=data.frame()
ubmr=unique(bmr)[,1]
n=ncol(bmr)
for (i in l:length(ubmr)){
index=which(bmr[,1]==ubmr[i])
ife=fe[index]
ufe=unique(ife)
for (j in 1:length(ufe)){
index=which(bmr[,1]==ubmr[i] & fe==ufe[j] )
imut=mut[index,]
mut1=rbind(mut1,rowSums(imut))
mut0=rbind(mut0,nrow(imut)-rowSums(imut))
aa=rbind(aa,c(ubmr[i],ubmr[i]*ufe[j]))
}
}
lln <- function(eta){
# log likelihood under null
b.pi0 <- aa[,1]^mut1*(1-aa[,1])^mut0
b.pi1 <- aa[,2]^mut1*(1-aa[,2])^mut0
b.pi0[b.pi0 <= 0] <- 1e-8
b.pi0[b.pi0 >= 1] <- 1 - 1e-8
b.pi1[b.pi1 <= 0] <- 1e-8
b.pi1[b.pi1 >= 1] <- 1 - 1e-8
pi= exp(eta)/(1+exp(eta))
ll <- sum(log(b.pi0+b.pi1),na.rm=T)
return(ll)
}
lla <- function(eta){
# log likelihood under alt
eta0 <- eta[1]
eta1 <- eta[2]
index0=min(which(e==0))
index1=min(which(e==1))
b.pi0 <- t(exp(eta0) * (fe - 1) * bmr[,index0] + bmr[,index0])
b.pi1 <- t(exp(eta1) * (fe - 1) * bmr[,index1] + bmr[,index1])
b.pi0[b.pi0 <= 0] <- 1e-8
b.pi1[b.pi1 <= 0] <- 1e-8
b.pi0[b.pi0 >= 1] <- 1 - 1e-8
b.pi1[b.pi1 >= 1] <- 1 - 1e-8
llmtx0 <- npos.e0 * log(1- b.pi0) + mutpos.e0 * log(b.pi0)
llmtx1 <- npos.e1 * log(1- b.pi1) + mutpos.e1 * log(b.pi1)
ll <- sum(llmtx0,na.rm=T) + sum(llmtx1,na.rm=T)
return(ll)
}
suppressWarnings(resn <- optim(0, lln, method="Nelder-Mead", control=list(fnscale=-1))) # BFGS has Error: non-finite finite-difference value [2]
suppressWarnings(resa <- optim(c(0,0), lla, method="Nelder-Mead", control=list(fnscale=-1)))
teststat<- -2*(resn$value-resa$value)
pvalue <- pchisq(teststat,df=1,lower.tail=FALSE)
res <- list("pvalue"=pvalue, "null.eta0" = resn$par, "alt.eta"=resa$par, "null.ll"= resn$value, "alt.ll" = resa$value)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.