# R/MIXnorm.R In S-YIN/FFPEnorm: FFPE RNA-Seq Normalization

#### Documented in dzipfunc_MIXnormMIXnorm

```#' log likelihood for zero inflated Poisson
#'
#' Input is a matrix with column specific Poisson mean and row specific probability of extra zero
#' @param x is a matrix, x[j,i] follows ZIP(delta[i],pi_j[j])
#' @param delta is a vector for poisson means
#' @param pi_j is a vector for prob. of extra zero
#' @return the log-likelihood of each row of x
dzip <- function(x, delta,pi_j)
{
# zip_llh = ZIP log-likelihood matrix for I*J data points
zip_llh <- matrix(NA,dim(x)[1],dim(x)[2])
for (j in 1: dim(x)[1])
{
indictr <- ifelse(x[j,]>0,1,0)
# if data contain zeros, use ZIP pmf
if (sum(indictr)!=0)
{
zip_llh[j,] <- indictr*(log(1-pi_j[j])+dpois(as.numeric(x[j,]),delta,T))+(1-indictr)*(log(pi_j[j]+(1-pi_j[j])*(exp(-delta))))
}
# if no zero, use Poisson pmf
if (sum(indictr)==0)
{
zip_llh[j,] <- (log(pi_j[j]+(1-pi_j[j])*(exp(-delta))))
}
}
den_zip <- rowSums(zip_llh)
den_zip
# returns the log-likelihood of each row of x
}

#' Main function for the nested EM algorithm for MIXnorm.
#'
#' This function takes the raw data as input and find the MLE of the mixture model.
#' @param dat input raw read count matrix. dim(dat)=J genes * I samples.
#' @param max_iter maximum number of iterations for the nested EM algorithm default is 20, recommend range (10, 50).
#' @param tol convergency criteria, default is 1e-2, recommend range (1e-5,1).
#' @param log_file file name that records the log.
#' @return the MLE of all parameters
#'
#' @export
#' @import truncnorm
MIXnorm <- function(dat, max_iter=20, tol=1e-2,log_file)
{
cat('Start running MIXnorm...',file=log_file,append=F)
cat('\n',file=log_file,append=T)
#return nonzero exit code is error occured
exit.code <- 1
cat('Start running MIXnorm... \n')
#check if the input arguments are correct
if (sum(dat%%1!=0)!=0)
{
cat('Error: Input data is not integer-valued read count matrix. MIXnorm terminated.',file=log_file,append=T)
cat('\n',file=log_file,append=T)
stop("Input data should be integer-valued read count matrix")
}

if (max_iter>50|max_iter<10)
{
cat('Error: Use proper maximum number of iteration (10, 50). MIXnorm terminated.',file=log_file,append=T)
cat('\n',file=log_file,append=T)
stop("max_iter is out of range")
}

if (tol>1|max_iter<1e-5)
{
cat('Error: Use proper convergence criteria (1e-5, 1). MIXnorm terminated.',file=log_file,append=T)
cat('\n',file=log_file,append=T)
stop("tol is out of range")
}
#end of checking

cat('Producing natural log transformation of input data',file=log_file,append=T)
cat('\n',file=log_file,append=T)

#deal with NA
dat[is.na(dat)] <- 0
row.names(dat) <- 1:dim(dat)[1]

#log-transformed data
log_dat <- as.data.frame(log(dat+1))
not_zero <- dat != 0
#I = #of zero's in each row
I <- rowSums(not_zero)
samp_no <- dim(dat)[2]

cat('Running iteration 1 ...',file=log_file,append=T)
cat('\n',file=log_file,append=T)
cat('Running iteration 1 ... \n')

#set initial values for nested EM algorithm
#normal mean for each patient
mu_i <- mu_i_final <- colMeans(log_dat)
#normal standard deviation
sigma_i <- sigma_i_final <- apply(log_dat, 2, sd)
#proportion of 0's from point mass of each gene
zip_pi <- pi_j <- 1-I/samp_no
#proportion of genes that are truely expressed
phi <- nonzero_prop <- mean(pi_j < .5)
#poisson means for background read of genes not expressed
delta <- poi_mean <- colMeans(dat[pi_j > .5,])
delta[delta==0] <- 0.001
zip_pi[I==0] <- pi_j[I==0] <- 1
zip_pi[I==dim(dat)[2]] <- pi_j[I==dim(dat)[2]] <- 0
#mu_i_final, sigma_i_final, phi, delta and pi_j are parameters after update

#run 1 iteration of E step
#outer E step: compute E(D_j|obs)
log_numtor <- apply(matrix(log(truncnorm::dtruncnorm(t(log_dat),a=0,b=Inf, mu_i, sigma_i)),dim(log_dat)[2],dim(log_dat)[1]), 2, sum)
log_dentor1 <- dzip(x=dat,delta=delta,pi_j=pi_j)
# exp(-746)=0 in R, the following 6 lines correct for the numerical issues
id_num <- which(log_numtor <= log_dentor1 & log_dentor1 <= (-700))
log_numtor[id_num] <- log_numtor[id_num]-log_dentor1[id_num]
log_dentor1[id_num] <- 0
id_den <- which(log_dentor1 <= log_numtor & log_numtor <= (-700))
log_dentor1[id_den] <- log_dentor1[id_den]-log_numtor[id_den]
log_numtor[id_den] <- 0

numtor <- phi*exp(log_numtor)
dentor1 <- (1-phi)*exp(log_dentor1)
dentor <- numtor+dentor1

D <- numtor/dentor
D[is.na(D)] <- 1
D[I==dim(dat)[2]] <- 1
D[I==0] <- 0
#end of outer E step

#inner E step: compute E(Z_ij|obs,D_j)
Z <- dat
for (i in 1:dim(dat)[2])
{
Z[,i] <- pi_j^(1-D)/(pi_j^(1-D)+(1-pi_j)^(1-D)*exp(-delta[i]*(1-D)))
}
Z[dat!=0] <- 0

#inner E step: compute E(T_i|obs,D)
beta <- -mu_i/sigma_i
T_i <- sum(D)*pnorm(beta)/(1-pnorm(beta))

#sum of augment data
mis_sum <- T_i*(mu_i-dnorm(-beta)*sigma_i/pnorm(beta))
#sum of augment data squares
mis_sum_sq <- T_i*(sigma_i^2+mu_i^2+sigma_i^2*(-beta)*dnorm(-beta)/pnorm(beta)-2*mu_i*sigma_i*dnorm(beta)/pnorm(beta))
#end of inner E step

iter <- 1
tolerance <- 9999

while (tolerance > tol)
{
iter <- iter + 1
if (iter > max_iter)
{
cat(paste('Algorithm exceeded maximum iteration (', max_iter, ') and stopped.'),file=log_file,append=T)
cat('\n',file=log_file,append=T)
warning(paste('Algorithm exceeded maximum iteration (', max_iter, ') and stopped.'))
break
}
cat(paste('Running iteration',iter,  '...'),file=log_file,append=T)
cat('\n',file=log_file,append=T)
cat(paste('Running iteration',iter,  '...\n'))
#outer M step
#update phi
phi <- mean(D)
#end of outer M step

#Inner EM to update (pi_j, delta_i) and (mu_i,sigma_i) K=5 times
K <- 1
while(K<=5)
{
#inner M step
pi_j <- rowMeans(Z)

deltanumtor <- colSums((1-D)*(1-Z)*dat)

mu_i_final <- (colSums(log_dat*D)+mis_sum)/(sum(D)+T_i)
sigma_i_final <- sqrt((colSums(log_dat^2*D)+mis_sum_sq)/(sum(D)+T_i)-mu_i_final^2)
#end of innter M step

#inner E (E2) step for Z (ZIP component)
if (K!=5)
{
for (i in 1:dim(dat)[2])
{
Z[,i] <- pi_j^(1-D)/(pi_j^(1-D)+(1-pi_j)^(1-D)*exp(-delta[i]*(1-D)))
}
Z[dat!=0] <- 0

#inner E (E2) step for T and Lt (Truncated Normal component augment data)
beta <- -mu_i_final/sigma_i_final
T_i <- sum(D)*pnorm(beta)/(1-pnorm(beta))
mis_sum <- T_i*(mu_i_final-dnorm(-beta)*sigma_i_final/pnorm(beta))
mis_sum_sq <- T_i*(sigma_i_final^2+mu_i_final^2+sigma_i_final^2*(-beta)*dnorm(-beta)/pnorm(beta)-2*mu_i_final*sigma_i_final*dnorm(beta)/pnorm(beta))
}
K <- K+1
}

#outer E step
log_numtor <- apply(matrix(log(truncnorm::dtruncnorm(t(log_dat),a=0,b=Inf, mu_i_final, sigma_i_final)),dim(log_dat)[2],dim(log_dat)[1]), 2, sum)
log_dentor1 <- dzip(x=dat,delta=delta,pi_j=pi_j)
#following 6 lines correct for the numerical issues
id_num <- which(log_numtor <= log_dentor1 & log_dentor1 <= (-700))
log_numtor[id_num] <- log_numtor[id_num]-log_dentor1[id_num]
log_dentor1[id_num] <- 0
id_den <- which(log_dentor1 <= log_numtor & log_numtor <= (-700))
log_dentor1[id_den] <- log_dentor1[id_den]-log_numtor[id_den]
log_numtor[id_den] <- 0

numtor <- phi*exp(log_numtor)
dentor1 <- (1-phi)*exp(log_dentor1)
dentor <- numtor+dentor1

D <- numtor/dentor
D[is.na(D)] <- 1
D[I==dim(dat)[2]] <- 1
D[I==0] <- 0
#end of outer E step

tolerance <- max(abs(mu_i_final-mu_i),abs(sigma_i_final-sigma_i),abs(nonzero_prop-phi))
cat(paste('Maximum parameter change between current and previous iteration is',tolerance),file=log_file,append=T)
cat('\n',file=log_file,append=T)

#update all parameters
mu_i <- mu_i_final
sigma_i <- sigma_i_final
nonzero_prop <- phi
poi_mean <- delta
zip_pi <- pi_j
}

if (iter <= max_iter)
{
cat(paste0('\n','Algorithm converges at ', iter,'th iteration.\n' ))
cat(paste0('\n','Algorithm converges at ', iter,'th iteration.\n' ),file=log_file,append=T)
cat('Exit nested EM algorithm and return MLE\n',file=log_file,append=T)
#successivefully obtained MLE, set exit.code to zero
exit.code <- 0
}

return(list(mu_i_final = mu_i_final,
sigma_i_final = sigma_i_final,
D = D,
phi = phi,
delta = delta,
pi_j = pi_j,
exit.code = exit.code))
}

#' Produce MIXnorm normalized expression matrix
#'
#' This function calls the MIXnorm function to obtain MLE of the mixture model,
#' then produces the normalized expression matrix.
#' @param dat input raw read count matrix. dim of dat = J genes * I samples.
#' @param max_iter maximum number of iterations for the nested EM algorithm default is 20, recommend range (10, 50).
#' @param tol convergency criteria, default is 1e-2, recommend range (1e-5,1).
#' @param log_file file name that records the log. Default log file name is MIXnorm.log.
#' @param appr binary True of False, indicates if the approximate version of normalization should be used.
#' @return A list contains the normalized expression matrix, proportion of expressed genes and probabilities of being expressed for all genes.
#' @export
func_MIXnorm <- function(dat,max_iter = 20, tol = 1e-2,log_file="MIXnorm.log",appr=T)
{
status_file=paste(log_file,'status',sep='.')
MIX_res <- MIXnorm(dat,max_iter=max_iter, tol=tol, log_file=log_file)

cat('Calculating the normalized expression \n' ,file=log_file,append=T)
beta <- (-MIX_res\$mu_i_final)/MIX_res\$sigma_i_final
subt <- matrix(rep((MIX_res\$mu_i_final+dnorm(beta)*MIX_res\$sigma_i_final/(1-pnorm(beta))),each=dim(dat)[1]),dim(dat)[1],dim(dat)[2])
#MIX_log_normalized returns the normalized data in log scale
if (!appr)
{
MIX_normalized_log <- as.data.frame(t(MIX_res\$D*(log(dat + 1)-subt)))
}
if (appr)
{
MIX_normalized_log <- as.data.frame(t((MIX_res\$D>0.5)*(log(dat + 1)-subt)))
}
names(MIX_res\$D) <- rownames(dat)
if (MIX_res\$exit.code!=0)
{
cat('1',file=status_file,append=F)
warning(paste('Algorithm exceeded maximum iteration (', max_iter, ') and stopped.'))
}
if (MIX_res\$exit.code==0)
{
cat('Normalization finished successfully \n' ,file=log_file,append=T)
cat('0',file=status_file,append=F)
}
return(list(MIX_normalized_log = t(MIX_normalized_log), phi = MIX_res\$phi, D = MIX_res\$D))
}
```
S-YIN/FFPEnorm documentation built on Nov. 20, 2019, 7:05 a.m.