# R/normalize2.R In CODEX: A Normalization and Copy Number Variation Detection Method for Whole Exome Sequencing

```normalize2 = function (Y_qc, gc_qc, K, normal_index) {
if (max(K) > length(normal_index))
stop("Number of latent Poisson factors K cannot exceed the number of
normal samples!")
N = colSums(Y_qc)
Nmat = matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, byrow = TRUE)
Yhat = list(length = length(K))
AIC = rep(NA, length = length(K))
BIC = rep(NA, length = length(K))
RSS = rep(NA, length = length)
for (ki in 1:length(K)) {
k = K[ki]
message("k = ", k)
maxiter = 10
maxhiter = 50
BHTHRESH = 1e-04
HHTHRESH = 1e-04
iter = 1
fhat = matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0)
fhatnew = matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc))
betahat = rep(1, nrow(Y_qc))
betahatmat = matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
data = betahat, byrow = FALSE)
ghat = matrix(0, nrow = nrow(Y_qc), ncol = k)
hhat = matrix(0, nrow = ncol(Y_qc), ncol = k)
bhdiff = rep(Inf, maxiter)
fhdiff = rep(Inf, maxiter)
betahatlist = list(length = maxiter)
fhatlist = list(length = maxiter)
ghatlist = list(length = maxiter)
hhatlist = list(length = maxiter)
while (iter <= maxiter) {
gcfit = Y_qc/Nmat/betahatmat/exp(ghat %*% t(hhat))
fhatnew <- apply(gcfit, 2, function(z) {
spl <- smooth.spline(gc_qc, z)
temp <- predict(spl, gc_qc)\$y
temp[temp <= 0] <- min(temp[temp > 0])
temp
})
fhatnew[fhatnew<quantile(fhatnew, 0.005)]=quantile(fhatnew, 0.005)
betahatnew = apply((Y_qc/(fhatnew * Nmat * exp(ghat %*%
t(hhat))))[,normal_index], 1, median)
betahatnew[betahatnew <= 0] = min(betahatnew[betahatnew > 0])
bhdiff[iter] = sum((betahatnew - betahat)^2)/length(betahat)
fhdiff[iter] = sum((fhatnew - fhat)^2)/length(fhat)
if (fhdiff[iter] > min(fhdiff))
break
message("Iteration ", iter, "\t", "beta diff =", signif(bhdiff[iter],
3), "\t", "f(GC) diff =", signif(fhdiff[iter], 3))
fhat = fhatnew
betahat = betahatnew
betahatmat = matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
data = betahat, byrow = FALSE)
L = log(Nmat * fhat * betahatmat)
logmat = log(pmax(Y_qc, 1)) - L
logmat = logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
data = apply(logmat, 1, mean), byrow = FALSE)
hhat = svd(logmat, nu = k, nv = k)\$v
hhatnew = hhat
hiter = 1
hhdiff = rep(Inf, maxhiter)
while (hiter <= maxhiter) {
for (s in 1:nrow(Y_qc)) {
temp=try(glm(formula = Y_qc[s,normal_index] ~ hhat[normal_index,] -
1, offset = L[s,normal_index], family = poisson)\$coefficients,silent=TRUE)
if(is.character(temp)){
temp=lm(log(pmax(Y_qc[s,normal_index],1)) ~ hhat[normal_index,] -
1, offset = log(L[s,normal_index]))\$coefficients
}
ghat[s, ] = temp
}
# avoid overflow or underflow of the g latent factors
ghat[is.na(ghat)]=0
if(max(ghat) >= 30){
ghat=apply(ghat,2,function(z){
z[z>quantile(z,0.995)] = min(quantile(z,0.995),30)
z})
}
if(min(ghat) <= -30){
ghat=apply(ghat,2,function(z){
z[z<quantile(z,0.005)] = max(quantile(z,0.005),-30)
z})
}
for (t in 1:ncol(Y_qc)) {
hhatnew[t, ] = glm(formula = Y_qc[, t] ~ ghat -
1, offset = L[, t], family = poisson)\$coefficients
}
gh = ghat %*% t(hhatnew)
gh = gh - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
data = apply(gh, 1, mean), byrow = FALSE)
hhatnew = svd(gh, nu = k, nv = k)\$v
hhdiff[hiter] = sum((hhatnew - hhat)^2)/length(hhat)
message("\t\t\t", "hhat diff =", signif(hhdiff[hiter], 3))
hhat = hhatnew
if (hhdiff[hiter] < HHTHRESH)
break
if (hiter > 10 & (rank(hhdiff))[hiter] <= 3)
break
hiter = hiter + 1
}
fhatlist[[iter]] = fhat
betahatlist[[iter]] = betahat
ghatlist[[iter]] = ghat
hhatlist[[iter]] = hhat
if (bhdiff[iter] < BHTHRESH)
break
if (iter > 5 & bhdiff[iter] > 1)
break
iter = iter + 1
}
optIter = which.min(fhdiff)
message(paste("Stop at Iteration ", optIter, ".", sep = ""))
fhat = fhatlist[[optIter]]
betahat = betahatlist[[optIter]]
ghat = ghatlist[[optIter]]
hhat = hhatlist[[optIter]]
betahatmat = matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
data = betahat, byrow = FALSE)
Yhat[[ki]] = pmax(round(fhat * Nmat * betahatmat * exp(ghat %*% t(hhat)),0),1)
AIC[ki] = 2 * sum(Y_qc * log(pmax(Yhat[[ki]],1)) - Yhat[[ki]]) -
2 * (length(ghat) + length(hhat))
BIC[ki] = 2 * sum(Y_qc * log(pmax(Yhat[[ki]],1)) - Yhat[[ki]]) -
(length(ghat) + length(hhat)) * log(length(Y_qc))
RSS[ki] = sum((Y_qc - Yhat[[ki]])^2/length(Y_qc))
message("AIC", k, " = ", round(AIC[ki], 3))
message("BIC", k, " = ", round(BIC[ki], 3))
message("RSS", k, " = ", round(RSS[ki], 3), "\n")
}
return(list(Yhat = Yhat, AIC = AIC, BIC = BIC, RSS = RSS, K = K))
}
```

## Try the CODEX package in your browser

Any scripts or data that you put into this service are public.

CODEX documentation built on Nov. 8, 2020, 8:22 p.m.