# Deconvolution, based on the generative model
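#
# Arguments, as used in the body below: S and E are vectors of read start
# and end positions; strand gives each read's strand (passed through to
# .loglikSET); peak = c(start, end) delimits the candidate region; sindex
# supplies the strand indicators indF and indR plus the background factor
# FRvec; Fratio is the proportion of forward-strand reads; beta enters the
# background density; mu_init, pi_init, pi0_init, delta_init, sigma_init
# initialize the EM run; L_table is accepted but not used in this function.
# Returns the final estimates, their per-iteration traces, the
# responsibilities Z and Z0, the log-likelihood trace, and AIC / BIC.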
.deconCoreSET <- function( S, E, strand, peak,
estDeltaSigma="common", lbDelta=25, lbSigma=25,
psize=21, Fratio=0.5, sindex, beta,
niter=50, mu_init, pi_init, pi0_init, delta_init, sigma_init,
L_table, stop_eps=1e-6, verbose=FALSE ) {
# construct grid
grid_min <- peak[1]
grid_max <- peak[2]
# search for binding sites only within the peak region
grid_vec <- seq( from = grid_min, to = grid_max )
# initialization
N <- length(S) # number of reads
n_group <- length(mu_init) # number of signal components
L <- E - S + 1 # read lengths (not used further in this function)
R <- grid_max - grid_min + 1 # width of the peak region
FRvec <- sindex$FRvec # per-read factor used in the background density
indF <- sindex$indF # indicator: read on the forward strand
indR <- sindex$indR # indicator: read on the reverse strand
mu <- mu_init # component centers (binding sites)
pi <- pi_init # signal component weights (masks base::pi inside this function)
pi0 <- pi0_init # background weight
delta <- delta_init # strand-specific offset of read ends from the binding site
sigma <- sigma_init # spread of read ends around the shifted centers
alpha <- rep( 1, N ) # per-read weights, uniform here; passed to .loglikSET
# EM iterations: track parameter estimates and log-likelihood at every step
mu_mat <- matrix( NA, niter, n_group )
mu_mat[1,] <- mu_init
pi_mat <- matrix( NA, niter, n_group )
pi_mat[1,] <- pi_init
pi0_vec <- rep( NA, niter )
pi0_vec[1] <- pi0_init
delta_vec <- rep( NA, niter )
delta_vec[1] <- delta_init
sigma_vec <- rep( NA, niter )
sigma_vec[1] <- sigma_init
ll <- rep( NA, niter )
ll[1] <- -Inf
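# ll[1] is -Inf so the first EM iteration always counts as an improvement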
for ( i in seq(from = 2, to = niter) ) {
if ( verbose ) {
print( paste("------------ iteration:",i,"------------") )
}
########################################################################
#                                                                      #
#                           E step: update Z                           #
#                                                                      #
########################################################################
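# generative model: with probability pi[g] a read arises from binding site
# g; its start S (forward strand) follows N( mu[g] - delta, sigma ) and its
# end E (reverse strand) follows N( mu[g] + delta, sigma ). Z[,g] below is
# proportional to each read's likelihood under component g; after the
# row-normalization that follows, it becomes the posterior responsibility.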
Z <- matrix( NA, N, n_group )
for ( g in seq_len(n_group) ) {
Z[,g] <- pi[g] * ( Fratio * dnorm( S, mu[g] - delta, sigma ) * indF +
( 1 - Fratio ) * dnorm( E, mu[g] + delta, sigma ) * indR )
}
Z0 <- FRvec * pi0 / ( R + beta - 1 )
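# background component: uniform over the R + beta - 1 candidate positions,
# scaled by the per-read factor FRvec; the row-normalization below makes
# Z0 + sum over g of Z[,g] equal 1 for every read (.ff_normalize is assumed
# to divide each row by its row sum)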
Znorm <- .ff_normalize( cbind(Z0,Z) )
Z0 <- Znorm[ , 1 ]
Z <- Znorm[ , -1, drop=FALSE ]
########################################################################
#                                                                      #
#                      CM step: update parameters                      #
#                                                                      #
########################################################################
# M step: update mu
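# closed-form CM update: mu[g] is the responsibility-weighted average of
# the shifted read ends, S + delta for forward reads and E - delta for
# reverse reads, obtained by zeroing the derivative of the expected
# complete-data log-likelihood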
for ( g in seq_len(n_group) ) {
sumF <- sum( Z[,g] * ( S + delta ) * indF )
sumR <- sum( Z[,g] * ( E - delta ) * indR )
mu[g] <- ( sumF + sumR ) / sum( Z[,g] )
}
# M step: update pi
for ( g in seq_len(n_group) ) {
pi[g] <- sum( Z[,g] ) / N
}
pi0 <- sum( Z0 ) / N
# safeguard for pi0: if the background weight exceeds every signal weight,
# drop the background component and renormalize the signal weights
if ( pi0 > max(pi) ) {
pi0 <- 0
pi <- pi / sum(pi)
}
# M step: update delta & sigma, if a common peak shape is not assumed
if ( estDeltaSigma == "separate" ) {
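# closed-form updates: delta is the responsibility-weighted average
# distance between read ends and their assigned centers, pooled over
# components, and sigma^2 is the corresponding pooled squared deviation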
# M step: update delta
delta <- 0
for ( g in seq_len(n_group) ) {
delta <- delta +
sum( Z[,g] * ( ( mu[g] - S ) * indF + ( E - mu[g] ) * indR ) )
}
delta <- delta / N
# M step: update sigma
sigma2 <- 0
for ( g in seq_len(n_group) ) {
sigma2 <- sigma2 + sum( Z[,g] * ( ( S - mu[g] + delta )^2 * indF +
( E - mu[g] - delta )^2 * indR ) )
}
sigma <- sqrt( sigma2 / N )
# safeguard: enforce lower bounds on delta & sigma
if ( delta < lbDelta ) {
delta <- lbDelta
}
if ( sigma < lbSigma ) {
sigma <- lbSigma
}
}
########################################################################
#                                                                      #
#       Identifiability, over-fitting, track estimates & loglik        #
#                                                                      #
########################################################################
# identifiability: impose an order constraint on the mu values
ord <- order(mu)
pi <- pi[ord]
mu <- mu[ord]
# check over-fitting -> reduce dimension
# (avoid identifiability problem due to over-fitting)
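# merge rule: walk through the (ordered) centers and, whenever two adjacent
# components are within psize of each other, average their centers and sum
# their weights, so that a single binding event is not split in two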
if ( n_group >= 2 ) {
# condition: distance <= psize
mu_new <- pi_new <- c()
mu_current <- mu[1]
pi_current <- pi[1]
for ( g in seq(from = 2, to = n_group) ) {
if ( abs( mu[g] - mu_current ) <= psize ) {
mu_current <- ( mu_current + mu[g] ) / 2
pi_current <- pi_current + pi[g]
} else {
mu_new <- c( mu_new, mu_current )
pi_new <- c( pi_new, pi_current )
mu_current <- mu[g]
pi_current <- pi[g]
}
}
mu_new <- c( mu_new, mu_current )
pi_new <- c( pi_new, pi_current )
mu <- mu_new
pi <- pi_new
n_group <- length(mu)
# drop components with negligible weight (pi < 0.01) -> reduce dimension
# (avoid identifiability problems due to over-fitting)
if ( any( pi < 0.01 ) ) {
mu <- mu[ pi > 0.01 ]
pi <- pi[ pi > 0.01 ]
n_group <- length(mu)
}
norm_const <- ( 1 - pi0 ) / sum(pi)
pi <- pi * norm_const
}
# safeguard: if only one component remains and its weight decreases too far,
# just stop
if ( n_group == 1 && pi[1] < 0.10 ) {
# use estimates in the last iteration
mu <- mu_mat[ (i-1), !is.na(mu_mat[(i-1),]) ]
mu_mat[ i, ] <- NA
mu_mat[ i, seq_along(mu) ] <- mu
pi <- pi_mat[ (i-1), !is.na(pi_mat[(i-1),]) ]
pi_mat[ i, ] <- NA
pi_mat[ i, seq_along(pi) ] <- pi
pi0 <- pi0_vec[(i-1)]
pi0_vec[i] <- pi0
delta <- delta_vec[(i-1)]
delta_vec[i] <- delta
sigma <- sigma_vec[(i-1)]
sigma_vec[i] <- sigma
# stop iteration
mu_mat <- mu_mat[ seq_len(i), , drop=FALSE ]
pi_mat <- pi_mat[ seq_len(i), , drop=FALSE ]
pi0_vec <- pi0_vec[ seq_len(i) ]
delta_vec <- delta_vec[ seq_len(i) ]
sigma_vec <- sigma_vec[ seq_len(i) ]
ll <- ll[ seq_len(i) ]
break
}
# track estimates
mu_mat[ i, seq_along(mu) ] <- mu
pi_mat[ i, seq_along(pi) ] <- pi
pi0_vec[i] <- pi0
delta_vec[i] <- delta
sigma_vec[i] <- sigma
if ( verbose ) {
print( "mu:" )
print( mu )
print( "pi:" )
print( pi )
print( "pi0:" )
print( pi0 )
print( "delta:" )
print( delta )
print( "sigma:" )
print( sigma )
}
# track complete log likelihood
ll[i] <- .loglikSET( S, E, strand, mu, pi, pi0, delta, sigma,
Fratio, sindex, beta, R, alpha )
if ( verbose ) {
print( "increment in loglik:" )
print( ll[i]-ll[(i-1)] )
}
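# EM never decreases the observed-data likelihood, so a drop indicates a
# numerical issue or a dimension change from the merging / pruning steps
# above; roll back to the previous iteration's estimates and stop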
# if loglik decreases, stop iteration
if ( ll[i] < ll[(i-1)] ) {
# use estimates in the last iteration
mu <- mu_mat[ (i-1), !is.na(mu_mat[(i-1),]) ]
mu_mat[ i, ] <- NA
mu_mat[ i, seq_along(mu) ] <- mu
pi <- pi_mat[ (i-1), !is.na(pi_mat[(i-1),]) ]
pi_mat[ i, ] <- NA
pi_mat[ i, seq_along(pi) ] <- pi
pi0 <- pi0_vec[(i-1)]
pi0_vec[i] <- pi0
delta <- delta_vec[(i-1)]
delta_vec[i] <- delta
sigma <- sigma_vec[(i-1)]
sigma_vec[i] <- sigma
# stop iteration
mu_mat <- mu_mat[ seq_len(i), , drop=FALSE ]
pi_mat <- pi_mat[ seq_len(i), , drop=FALSE ]
pi0_vec <- pi0_vec[ seq_len(i) ]
delta_vec <- delta_vec[ seq_len(i) ]
sigma_vec <- sigma_vec[ seq_len(i) ]
ll <- ll[ seq_len(i) ]
break
}
# check whether to stop iterations
if ( ll[i] - ll[(i-1)] < stop_eps ) {
# stop if no improvement in loglik
if ( verbose ) {
print( "stop because there is no improvements in likelihood." )
}
mu_mat <- mu_mat[ seq_len(i), , drop=FALSE ]
pi_mat <- pi_mat[ seq_len(i), , drop=FALSE ]
pi0_vec <- pi0_vec[ seq_len(i) ]
delta_vec <- delta_vec[ seq_len(i) ]
sigma_vec <- sigma_vec[ seq_len(i) ]
ll <- ll[ seq_len(i) ]
break
} else {
# stop if no improvement in estimates
if ( sum( !is.na(mu_mat[i,]) ) == sum( !is.na(mu_mat[(i-1),]) ) &&
all( abs( mu_mat[i,!is.na(mu_mat[i,])] - mu_mat[(i-1),!is.na(mu_mat[(i-1),])] ) < stop_eps ) &&
all( abs( pi_mat[i,!is.na(pi_mat[i,])] - pi_mat[(i-1),!is.na(pi_mat[(i-1),])] ) < stop_eps ) &&
abs( pi0_vec[i] - pi0_vec[(i-1)] ) < stop_eps &&
abs( delta_vec[i] - delta_vec[(i-1)] ) < stop_eps ) {
if ( verbose ) {
print( "stop because there is no improvements in estimates." )
}
mu_mat <- mu_mat[ seq_len(i), , drop=FALSE ]
pi_mat <- pi_mat[ seq_len(i), , drop=FALSE ]
pi0_vec <- pi0_vec[ seq_len(i) ]
delta_vec <- delta_vec[ seq_len(i) ]
sigma_vec <- sigma_vec[ seq_len(i) ]
ll <- ll[ seq_len(i) ]
break
}
}
}
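# model size for AIC/BIC: n_group centers + n_group weights + pi0 + delta +
# sigma, i.e., 2*n_group + 3 free parameters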
final_ll <- .loglikSET( S, E, strand, mu, pi, pi0, delta, sigma,
Fratio, sindex, beta, R, alpha )
aicValue <- -2 * final_ll + (2*n_group+3) * 2
bicValue <- -2 * final_ll + (2*n_group+3) * log(N)
return( list( mu=mu, pi=pi, pi0=pi0, delta=delta, sigma=sigma,
mu_mat=mu_mat, pi_mat=pi_mat, pi0_vec=pi0_vec, delta_vec=delta_vec,
sigma_vec=sigma_vec, Z=Z, Z0=Z0, loglik=ll, AIC=aicValue, BIC=bicValue ) )
}
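
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; wrapped in if (FALSE) so that sourcing
# this file does not execute it). All input values below are hypothetical,
# and the helpers .ff_normalize and .loglikSET must be available in the
# calling environment. The sketch simulates single-end reads from two
# binding sites inside a peak on [1000, 2000] and runs the EM with two
# initial components.
# ---------------------------------------------------------------------------
if ( FALSE ) {
set.seed(1)
n_reads <- 200
site <- sample( c(1300, 1700), n_reads, replace=TRUE )
isF <- rbinom( n_reads, 1, 0.5 ) # 1 = forward strand
# forward reads: start ~ N( site - delta, sigma ); reverse reads:
# end ~ N( site + delta, sigma ); delta = 50, sigma = 30, read length 150
S <- round( ifelse( isF == 1,
rnorm( n_reads, site - 50, 30 ),
rnorm( n_reads, site + 50, 30 ) - 150 ) )
E <- S + 150
strand <- ifelse( isF == 1, "F", "R" )
sindex <- list( indF = as.numeric( isF == 1 ),
indR = as.numeric( isF == 0 ),
FRvec = rep( 1, n_reads ) )
fit <- .deconCoreSET( S=S, E=E, strand=strand, peak=c(1000, 2000),
sindex=sindex, beta=1,
mu_init=c(1250, 1750), pi_init=c(0.45, 0.45), pi0_init=0.10,
delta_init=50, sigma_init=30, L_table=NULL, verbose=TRUE )
fit$mu # estimated binding-site positions
fit$BIC # compare across runs with different numbers of initial components
}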