#' Function to produce a sequencing read at one locus with converage=ndepth from given a genotype and the error scores (one error for one read).
#'
#' It uses formula \eqn{p(D=g|G_1=g)=1-e,p(D=other\ call|G_1=g)=e/3} to generate the read for a given genotype \eqn{G_1} and error rate \eqn{e}.
#' It is called from the \code{\link{generate_seqdata}} functions.
#' @param genotype vector of two alleles for a given genotype, ie. genotype CT is given as ('C', 'T')
#' @param error vector of sequencing errors, length of this vector is the same as read depth (ndepth).
#' @param ndepth the read depth (integer)
#' @return a row vector of sequence reads consisting of 'A','G','T' or 'C' for a single variant with length the same as read depth (ndepth).
#' @export
seq_call <- function(genotype, error, ndepth){
if(length(error) != ndepth) {
cat ('dimension error in seq_call.\n')
return(NULL)
}
vect_reads = rep(NA,ndepth)
all_alleles = c('A','G','T','C')
for (i in 1:ndepth){
s = rbinom(1,1,0.5)+1
g = genotype[s]
ng = all_alleles[all_alleles!=g]
e3 = error[i]/3
e23 = e3*2
e = e3*3
r = runif(1)
if (r <= e3) {
vect_reads[i] <- ng[1]
}else if (r > e3 & r <= e23) {
vect_reads[i] <- ng[2]
}else if (r > e23 & r <= e) {
vect_reads[i] <- ng[3]
}else{
vect_reads[i] <- g
}## end if r
}# end for loop
return (vect_reads)
}
#' Function to compute the genotype likelihood of a read given the sequencing error rate \eqn{e} and true genotype \eqn{G_1G_2}: \eqn{P(D=g|G=G_1G_2,e), g, G_1, G_2 } have values of A, C, G or T.
#'
#'This function uses the formula \eqn{P(D=g|G=G_1G_2)=1/2*P(D=g|G_1)+1/2*P(D=g|G_2)} and \eqn{p(D=g|G_1=g)=1-e,p(D=other\ call|G_1=g)=e/3}
#'to compute the genotype likelihood \eqn{P(D=g|G=G_1G_2,e)}, more details are in Appendix B of the paper. It is called from functions \code{\link{calc_pobs_ndepth}} and \code{\link{calc_Mr}}.
#'
#' @param sing_read single read, e.g. A.
#' @param A1 allele 1 in the true genotype, e.g. if \eqn{G_1G_2=AC} then A1='A'
#' @param A2 allele 2 in the true genotype, e.g. A2='C'
#' @param error sequencing error, decimal with value between 0 and 1.
#' @return likelihood of observing sing_read given genotype (A1A2) and error rate
calc_psingle = function(sing_read,A1,A2,error){
if (sing_read == A1){
p1 = 1-error
}else{
p1 = error/3
} ## end if
if (sing_read == A2){
p2 = 1-error
}else{
p2 = error/3
} ## end if
p = 1/2*p1 + 1/2*p2
return (p)
}
#' Function to calculate the likelihood matrix \eqn{P(D|G)} given a vector of error rates and a vector of sequence reads.
#'
#' This function calculates a likelihood matrix \eqn{P(D|G)} given a vector of error rates and a vector of sequence reads, where D is the sequencing data and G is the true genotype. It uses function \code{\link{calc_psingle}} to calculate the likelihood for each read given one possible genotype (AA, Aa or aa).
#' It is called from function \code{\link{generate_seqdata}}.
#'
#' @param Error vector of sequencing error rates. Similarly as error in the function \code{\link{seq_call}}, but concatenate all error values over all samples.
#' @param vect_row_reads row vector of sequence reads consisting of 'A','G','T' or 'C', similar as output from function \code{\link{seq_call}}, but concatenated over all samples in same fashion as Error.
#' @param ndepth integer, read depth, should have value the same as length of error or length of vect_row_reads.
#' @return a matrix with 3 columns representing P(read|AA), P(read|Aa) and P(read|aa), where row numbers equal to ndepth.
calc_pobs_ndepth= function(Error,vect_row_reads,ndepth=1){
L = length(Error)
L2=length(vect_row_reads)
if(L!=ndepth || L2!=ndepth) {
cat("Lengths mismatch between error, reads and read-depth in cal_pobs_ndepth.\n");
return(NULL)
}
M = NULL
LG =c('TT','CT','CC')
for (i in 1:L){
m = NULL
for (j in 1:3){
G = LG[j]
A1 = substring(G,1,1)
A2 = substring(G,2,2)
m = c(m,calc_psingle(vect_row_reads[i],A1,A2,Error[i]))
}
M = rbind(M,m)
}
return (M)
}
#' Function to calcualte likelihood \eqn{P(D|G)} for simulating data, more general case of \code{\link{calc_pobs_ndepth}}
#'
#' This function calculates a likelihood matrix \eqn{P(D|G)} given a vector of error rates and a vector of sequence reads, where D is the sequencing data and G is the true genotype. It uses function \code{\link{calc_psingle}} to calculate the likelihood for each read given one possible genotype (AA, Aa or aa).
#' It is a more general case of \code{\link{calc_pobs_ndepth}} with an additional param rdv, and called from \code{\link{generate_seqdata}}.
#'
#' @param rdv integer vector of read depths concatenated for all samples.
#' @param error vector of base call error rates; Uses the same error input in the \code{seq_call} function but concatenate all error values over all samples.
#' @param vect_row_reads row vector of sequence reads consisting of 'A','G','T' or 'C', same as output from function \code{seq_call}, but concatenated over all samples in same fashion as error.
#' @param ndepth read depth (integer), it eaquals to the length of error or vect_row_reads, or sum of vector rdv
#' @return a matrix (M) with 3 columns representing P(read|AA), P(read|Aa) and P(read|aa), where row dimensions equal length(error)=length(vect_row_reads).
calc_Mr = function(error,vect_row_reads,ndepth=1,rdv){
L = length(error)
L2=length(vect_row_reads)
if(L!=ndepth || L2!=ndepth) {cat("Lengths mismatch between error, reads and read-depth in calc_Mr."); return(NULL)}
L = length(rdv)
M = NULL
LG =c('TT','CT','CC')
S = 0
for (i in 1:L){
m = NULL
for (j in 1:3){
LL = 1
for (kk in 1:rdv[i]){
G = LG[j]
A1 = substring(G,1,1)
A2 = substring(G,2,2)
LL = LL*calc_psingle(vect_row_reads[S+kk],A1,A2,error[S+kk])
}
m =c(m,LL)
}
S =S+rdv[i]
M = rbind(M,m)
}
return (M)
}
#' Function to calculate the conditional expected genotype probability \eqn{E(P(G_{ij}|D_{ij}))} given the genotype likelihoods \eqn{P(D_{ij}|G_{ij}=g)} and frequencies. Simplified version of \code{\link{calc_EG_general}}.
#'
#' Calculate the conditional expected genotype probability given the observed sequencing data \eqn{D_{ij}} from formula \eqn{E(P(G_{ij}|D_{ij}))=\sum_{g=0}^2 gP(G_{ij}=g|D_{ij})}, where \eqn{ P(G_{ij}=g|D_{ij})=P(D_{ij}|G_{ij}=g)*P(G_{ij}=g)/P(D_{ij})}.
#' \eqn{P(D_{ij}|G_{ij}=g)} (input M) is from the VCF file or function \code{\link{calc_pobs_ndepth}}, \eqn{p(G_{ij}=g)} (input p) is output from function \code{\link{calc_EM}}.
#' All values for \eqn{P(G_{ij}=g|D_{ij})} are scaled by \eqn{P(D_{ij})}.
## It is called by \code{\link{generate_seqdata}}.
## talk to Andriy, he said in the kk loop, he did it for more general case, we only need rdv=1 for our case.
## so I renamed Andriy's get_EG to calc_EG_general, delete the kk loop in calc_EG function.
## all values are rescaled by P(D_ij), (see m below, but it is not probability, so m/sum(m) gives us sth. like prob.
#'
#' @param M genotype likelihoods with dimension of number of sample times 3 (double), each column is the genotype likelihood \eqn{P(D_{ij}|G=AA, Aa\ or\ aa)} for one locus. It uses output from \code{\link{get_exp_geno}} or \code{\link{get_exp_MAF}} for VCF input.
#' @param p genotype frequencies \eqn{P(G=AA, Aa\ or\ aa)} or \eqn{p(G=0,1,2\ minor\ allele)} for each SNP, it is a vector of length 3 and uses the output from function \code{\link{calc_EM}}.
#' @param rdv read depth for all samples. Dummy variable in \code{\link{calc_EG}}, listed here to be consitent with \code{\link{calc_EG_general}}. It is a vector of integers of one with length equal to the number of samples.
#' @return a vector with the same length as rdv, containing conditional expectation probability \eqn{ E(P(G_{ij}|D_{ij}))}.
calc_EG <- function(M, p, rdv) {
LL = length(rdv)
EG = NULL
g = c(0,1,2)
for (i in 1:LL){
m = NULL
### m=(M[i,1]*p[1],M[i,2]*p[2],M[i,3]*p[3])
for (j in 1:3){
L=M[i,j]
m = c(m,L*p[j])
}
pm = sum(m/sum(m)*g)
EG = c(EG,pm)
}
return (EG)
}
#' More general function to calculate the conditional expected genotype probability \eqn{E(P(G_{ij}|D_{ij}))} given the genotype likelihoods \eqn{P(D_{ij}|G_{ij}=g)} and frequencies, see \code{\link{calc_EG}} for simplified version.
#'
#'
#' Calculate the conditional expected genotype probability given the observed sequencing data \eqn{D_{ij}} from formula \eqn{E(P(G_{ij}|D_{ij}))=\sum_{g=0}^2 gP(G_{ij}=g|D_{ij})}, where \eqn{ P(G_{ij}=g|D_{ij})=P(D_{ij}|G_{ij}=g)*P(G_{ij}=g)/P(D_{ij})}. It is called from \code{\link{generate_seqdata}}.
#'
#' @param M genotype likelihoods with dimension of number of sample by 3 (double), each column is the genotype likelihood \eqn{P(D_{ij}|G=AA, Aa\ or\ aa)} for one locus; uses output from \code{\link{calc_pobs_ndepth}} for simulation data.
#' @param p genotype frequencies \eqn{P(G=AA, Aa\ or\ aa)} for each variant; output from function \code{\link{calc_EM}}.
#' @param rdv read depth for all samples, should be a vector of integer with length equal to number of samples.
#' @return a vector of containing conditional expected genotype probability. Length should be the same as rdv.
#'
calc_EG_general= function(M,p,rdv){
LL = length(rdv)
S = 0
EG = NULL
g = c(0,1,2)
for (i in 1:LL){
m = NULL
for (j in 1:3){
L = 1
for (kk in 1:rdv[i]){
L = L*M[S + kk,j]
}
m = c(m,L*p[j])
}
S = S + rdv[i]
pm = sum(m/sum(m)*g)
## added by Jiafen, simulation found with EG<0.
if(pm<0) {pm=0}
if(pm>2) {pm=2}
EG = c(EG,pm)
}
return (EG)
}
#'Function to calculate the variance of conditional expected genotype probability \eqn{Var(E(P(G_{ij}|D_{ij})))}.
#'
#' This is the variance for the simplified function \code{\link{calc_EG}}. It used the formula \eqn{var(X)=E(X^2)-E(X)^2}, where \eqn{E(X)} is the same as the result of \code{\link{calc_EG}}.
## talk to Andriy, he said in the kk loop, he did it for more general case, we only need rdv=1 for our case.
#### uses output from \code{calc_pobs_ndepth} function for simulation data and output from \code{\link{get_exp_geno}} or \code{\link{get_exp_MAF}} for VCF input
#' @param M genotype likelihoods \eqn{P(D_{ij}|G=AA, Aa\ or\ aa)} for one locus, matrix of dimension number of samples by 3 (double);
#' @param p genotype frequencies \eqn{P(G=AA, Aa\ or\ aa)} (double); output from function \code{\link{calc_EM}}.
#' @param rdv read depth (vector of integers) for all samples
#' @return the variance of \eqn{E(P(G_{ij}|D_{ij}))}.
calc_EG_Var = function(M,p,rdv){
LL = length(rdv)
EG2 = NULL
g = c(0,1,2)
for (i in 1:LL){
m = NULL
for (j in 1:3){
L=M[i,j]
m = c(m,L*p[j])
}
pm = sum(m/sum(m)*g^2) - sum(m/sum(m)*g)^2
EG2 = c(EG2,pm)
}
return (EG2)
}
#' Function to calculate the variance of conditional expected genotype probability \eqn{Var(E(P(G_{ij}|D_{ij})))}, more general case of \code{\link{calc_EG_Var}}
#'
#' This is the variance for function \code{\link{calc_EG_general}}. It used the formula \eqn{var(X)=E(X^2)-E(X)^2}, where \eqn{E(X)} is the same as the result of \code{\link{calc_EG_general}}.
#'
#' @param M genotype likelihoods \eqn{P(D_{ij}|G=AA, Aa\ or\ aa)} for one locus, matrix of dimension number of samples by 3 (double);
#' @param p genotype frequencies \eqn{P(G=AA, Aa\ or\ aa)} (double); output from function \code{\link{calc_EM}}.
#' @param rdv read depth (vector of integers) for all samples
#' @return the variance of \eqn{E(P(G_ij|D_ij))}
calc_EG_Var_general = function(M,p,rdv){
L = length(rdv)
S = 0
EG = NULL
g = c(0,1,2)
for (i in 1:L){
m = NULL
for (j in 1:3){
L = 1
for (kk in 1:rdv[i]){
L = L*M[S + kk,j]
}## end for kk
m = c(m,L*p[j])
}##end for j
S = S + rdv[i]
pm = sum(m/sum(m)*g^2) - sum(m/sum(m)*g)^2
EG = c(EG,pm)
}
return (EG)
}
#' Function to use EM algorithm to estimate the genotype frequencies in the sample
#'
#' Given a matrix (M) with dimension number of sample by 3 consisting of genotype likelihoods for each individual, this function uses the EM algorithm to estimate genotype frequencies.
#' It returns a vector with three decimals to stands for probability of 0, 1 or 2 minor alleles in the whole sample
#'
#' @param M matrix of genotype likelihoods derived from Phred-scale likelihood in VCF input for one variant.
#' @return a vector with three decimals to stands for probability of 0, 1 or 2 minor alleles.
#' @export
calc_EM <- function(M){
p_0 = 0.15
q_0 = 0.15
q_n = 1
p_n = 0
d_n = 0
k = 0
while ((p_n - p_0)^2 + (q_n - q_0)^2>0.000001){
d_0 = 1-p_0 - q_0
v = c(p_0,q_0,d_0)
p_D = M%*%(v)
E_p = M[,1]*p_0/p_D
E_q = M[,2]*q_0/p_D
p_n= p_0
q_n = q_0
d_n = 1-q_0-p_0
p_0 = sum(E_p)/length(E_p)
q_0 = sum(E_q)/length(E_q)
k = k+1
if (k==1000){
# cat('hi','\n')
return ( c(p_0, q_0, 1-p_0-q_0) )
}
}
return ( c(p_0, q_0, 1-p_0-q_0) )
}
#' Function to calculate the probability from given odds
#'
#' @param odds odds
#' @return probs probability corresponding to the given odds
odds.2.probs<-function(odds)
{
probs<-odds/(1+odds)
return(probs)
}
#' Function to simulate the correlated binary data with given OR.
#'
#' @param N Population size, greater than ncase+ncont
#' @param preval prevalence rate of the disease
#' @param mafco MAF for controls
#' @param OR the odds ratio
#' @param ncase Number of cases in the taken sample, taken from case of the N population
#' @param ncont Number of controls in the sample, take from the controls of N population
#' @return a dataframe include genotype data (x) and phenotype data (y) in each column
sim.corr.binary.data<-function(N,preval,mafco,OR,ncase,ncont)
{
Ncont=N-floor(N*preval)
prob.y0<-Ncont/N; prob.y1=1-prob.y0
## p(x=0,y=0)=p(y=0|x=0)*P(x=0)=p(y=0|x=0)*(P(x=0,y=0)+P(x=0,y=1))=(1/(1+exp(beta0)))*(P(x=0,y=0)+p(x=0,y=1))==>p(x=0,y=1)=exp(beta0)*P(x=0,y=0)
## similarly, P(x=1,y=1)=exp(beta0+beta1)*p(x=1,y=0); p(x=2,y=1)=exp(beta0+2beta1)*p(x=2,y=0)
### HWE
## p(x=0,y=0)=P(x=0|y=0)*p(y=0)=(1-mafco)^2*Ncont/N; p(x=1,y=0)=2*(1-mafco)*mafca*Ncont/N; p(x=2,y=0)=mafco^2*Ncont/N, sum(p(x=i,y=0))=p(y=0)=Ncont/N
## p(x=0,y=1)=(1-mafca)^2*Ncase/N; p(x=1,y=1)=2*mafca*(1-mafca)*Ncase/N; p(x=2,y=1)=mafca^2*Ncase/N
### so exp(beta0)*P(x=0,y=0)+exp(beta0+beta1)*p(x=1,y=0)+exp(beta0+2beta1)*p(x=2,y=0)=Ncase/N=1-Ncont/N
## exp(beta0)=(N-Ncont)/(p(x=0,y=0)+exp(beta1)*p(x=1,y=0)+exp(2beta1)*p(x=2,y=0))=(N-Ncont)/(Ncont*(mafca*(1-exp(beta1))-1)^2)
ebeta0=(N-Ncont)/(Ncont*(mafco*(1-OR)-1)^2)
## ebeta0=odds.y1.x0=exp(beta0); OR=ebeta1:=exp(beta1)
odds.y1.x0<-ebeta0;
odds.y1.x1<-ebeta0*OR
odds.y1.x2<- ebeta0*OR^2
# cat('ebeta0=',ebeta0,'\n')
Ncase=N-Ncont
## p(x=0,y=0), p(x=1,y=0), p(x=0,y=1), p(x=1,y=1)
prob.x0.y0=Ncont/N*(1-mafco)^2; prob.x1.y0=2*mafco*(1-mafco)*Ncont/N;
prob.x0.y1=odds.y1.x0*prob.x0.y0; prob.x1.y1=odds.y1.x0*OR*prob.x1.y0
Y=c(rep(1,Ncase),rep(0,Ncont))
tmp=runif(Ncase,min=0,max=prob.y1) ## prob(x=0,y=1)+prob(x=1,y=1)+prob(x=2,y=1)=prob(y=1)
x1<-ifelse(tmp>prob.x0.y1,ifelse(tmp<prob.x0.y1+prob.x1.y1,1,2),0)
tmp=runif(Ncont,min=0,max=prob.y0) #prob(x=0,y=0)+prob(x=1,y=0)+prob(x=2,y=0)=prob(y=0)
x0<-ifelse(tmp>prob.x0.y0,ifelse(tmp<prob.x0.y0+prob.x1.y0,1,2),0)
x=c(x1,x0)
case.index=sample(1:Ncase,ncase)
cont.index=sample((Ncase+1):N,ncont)
x=x[c(case.index,cont.index)]; y=Y[c(case.index,cont.index)]
dat<-data.frame(x=x,y=y)
return(dat)
}
#' Function to calculate expected genotype data for simuation data, given simulated reads, error rate and read depth.
#'
#' @param reads simulated reads
#' @param error_vector a vector for error rate for each simulated data of each person, one person by one person
#' @param rdepth_vector a vector for read depth for each simulated data of each person, one person by one person
#' @return a list includes expected genotype data (exp_geno) and population frequency (geno_freq)
calc_simu_exp<-function(reads,error_vector,rdepth_vector){
t_ndepth=sum(rdepth_vector)
M = calc_Mr(error_vector,reads,t_ndepth,rdepth_vector)
Mm = calc_pobs_ndepth(error_vector,reads,t_ndepth)
p = calc_EM(M)
# cat('i=',i,',p=',p,',sum_p=',sum(p),'\n')
p[p<0]=0
EG = calc_EG_general(Mm,p,rdepth_vector)
return(list(exp_geno=EG,geno_freq=p))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.