# dnds1 and dnds2 are dndsout objects generated by running dndscv on a list of known cancer genes in two datasets.
lik_func = function(dnds1, dnds2) {
# Subfunction: Analytical opt_t using only neutral subs
mle_tcv = function(n_neutral, exp_rel_neutral, shape, scale) {
tml = (n_neutral+shape-1)/(exp_rel_neutral+(1/scale))
if (shape<=1) { # i.e. when theta<=1
tml = max(shape*scale,tml) # i.e. tml is bounded to the mean of the gamma (i.e. y[9]) when theta<=1, since otherwise it takes meaningless values
}
return(tml)
}
my_selfun_cv = function(j) {
# Define variables for group A
xA_N = dnds1$RefCDS[[j]]$N
xA_L = dnds1$RefCDS[[j]]$L
y_A = as.numeric(dnds1$genemuts[j,-1])
exp_rel_A = y_A[5:8]/y_A[5]
shape_A = dnds1$nbreg$theta
scale_A = y_A[9]/dnds1$nbreg$theta
# Define variables for group B
xB_N = dnds2$RefCDS[[j]]$N
xB_L = dnds2$RefCDS[[j]]$L
y_B = as.numeric(dnds2$genemuts[j,-1])
exp_rel_B = y_B[5:8]/y_B[5]
shape_B = dnds2$nbreg$theta
scale_B = y_B[9]/dnds2$nbreg$theta
numrates = length(dnds1$mutrates)
indneut = 1
opt_t_A = mle_tcv(n_neutral=sum(y_A[indneut]), exp_rel_neutral=sum(exp_rel_A[indneut]), shape=shape_A, scale=scale_A)
opt_t_B = mle_tcv(n_neutral=sum(y_B[indneut]), exp_rel_neutral=sum(exp_rel_B[indneut]), shape=shape_B, scale=scale_B)
mrfold_A = max(1e-10, opt_t_A/y_A[5]) # Correction factor of "t" based on the obs/exp ratio of "neutral" mutations under the model
mrfold_B = max(1e-10, opt_t_B/y_B[5]) # Correction factor of "t" based on the obs/exp ratio of "neutral" mutations under the model
# Store obs and expected counts
n_obs_mis_A = y_A[2]
n_obs_mis_B = y_B[2]
n_obs_non_A = sum(y_A[3:4])
n_obs_non_B = sum(y_B[3:4])
n_ind_A = dnds1$sel_cv$n_ind[j]
n_ind_B = dnds2$sel_cv$n_ind[j]
n_exp_mis_A = y_A[6]
n_exp_mis_B = y_B[6]
n_exp_non_A = sum(y_A[7:8])
n_exp_non_B = sum(y_B[7:8])
exp_ind_A = dnds1$sel_cv$exp_ind[j]
exp_ind_B = dnds2$sel_cv$exp_ind[j]
# Calculate Excess for Indels
excess_ind_A = (n_ind_A - exp_ind_A) / num_patients_A
excess_ind_B = (n_ind_B - exp_ind_B) / num_patients_B
excess_ind_T = ((n_ind_A + n_ind_B) - (exp_ind_A + exp_ind_B)) / (num_patients_A + num_patients_B)
# Calculate Excess for SNVs
excess_mis_T = ( (n_obs_mis_A + n_obs_mis_B) - (n_exp_mis_A * mrfold_A + n_exp_mis_B * mrfold_B)) / (num_patients_A + num_patients_B)
excess_non_T = ( (n_obs_non_A + n_obs_non_B) - (n_exp_non_A * mrfold_A + n_exp_non_B * mrfold_B)) / (num_patients_A + num_patients_B)
excess_mis_A = (n_obs_mis_A - (n_exp_mis_A * mrfold_A)) / num_patients_A
excess_mis_B = (n_obs_mis_B - (n_exp_mis_B * mrfold_B)) / num_patients_B
excess_non_A = (n_obs_non_A - (n_exp_non_A * mrfold_A)) / num_patients_A
excess_non_B = (n_obs_non_B - (n_exp_non_B * mrfold_B)) / num_patients_B
# Calculate "dN/dS" for SNVs
wmis_T_A = 1 + ((excess_mis_T * num_patients_A) / (n_exp_mis_A * mrfold_A))
wmis_T_B = 1 + ((excess_mis_T * num_patients_B) / (n_exp_mis_B * mrfold_B))
wnon_T_A = 1 + ((excess_non_T * num_patients_A) / (n_exp_non_A * mrfold_A))
wnon_T_B = 1 + ((excess_non_T * num_patients_B) / (n_exp_non_B * mrfold_B))
wmis_A = 1 + ((excess_mis_A * num_patients_A) / (n_exp_mis_A * mrfold_A))
wmis_B = 1 + ((excess_mis_B * num_patients_B) / (n_exp_mis_B * mrfold_B))
wnon_A = 1 + ((excess_non_A * num_patients_A) / (n_exp_non_A * mrfold_A))
wnon_B = 1 + ((excess_non_B * num_patients_B) / (n_exp_non_B * mrfold_B))
# Calculate "dN/dS" for Indels
wind_A = 1 + ((excess_ind_A * num_patients_A) / exp_ind_A)
wind_B = 1 + ((excess_ind_B * num_patients_B) / exp_ind_B)
wind_T_A = 1 + ((excess_ind_T * num_patients_A) / exp_ind_A)
wind_T_B = 1 + ((excess_ind_T * num_patients_B) / exp_ind_B)
# Prevent negative dn/ds in joint estimates: in those cases, estimate the total dn/ds as (obs_A + obs_B)/(exp_A + exp_B). This is equivalent to testing for differences dn/ds instead of Delta.Nd, which is reasonable in the case of negative selection.
wmis_T_A_new = ifelse(wmis_T_A > 0 & wmis_T_B >0, wmis_T_A, (n_obs_mis_A + n_obs_mis_B)/(n_exp_mis_A * mrfold_A + n_exp_mis_B * mrfold_B))
wmis_T_B_new = ifelse(wmis_T_A > 0 & wmis_T_B >0, wmis_T_B, (n_obs_mis_A + n_obs_mis_B)/(n_exp_mis_A * mrfold_A + n_exp_mis_B * mrfold_B))
wnon_T_A_new = ifelse(wnon_T_A > 0 & wnon_T_B >0, wnon_T_A, (n_obs_non_A + n_obs_non_B)/(n_exp_non_A * mrfold_A + n_exp_non_B * mrfold_B))
wnon_T_B_new = ifelse(wnon_T_A > 0 & wnon_T_B >0, wnon_T_B, (n_obs_non_A + n_obs_non_B)/(n_exp_non_A * mrfold_A + n_exp_non_B * mrfold_B))
wind_T_A_new = ifelse(wind_T_A > 0 & wind_T_B >0, wind_T_A, (n_ind_A + n_ind_B)/(exp_ind_A + exp_ind_B))
wind_T_B_new = ifelse(wind_T_A > 0 & wind_T_B >0, wind_T_B, (n_ind_A + n_ind_B)/(exp_ind_A + exp_ind_B))
wmis_T_A = wmis_T_A_new
wmis_T_B = wmis_T_B_new
wnon_T_A = wnon_T_A_new
wnon_T_B = wnon_T_B_new
wind_T_A = wind_T_A_new
wind_T_B = wind_T_B_new
# Alternative (more onservative) approach to deal with negative dn/ds: replace them by NA. Accompany this by a note explaining that NA's are associated with cases of conditional purifying selection. In some of those cases the assumptions of the null model are violated and it is not feasible to compare differences in the number of purged mutations.
# wmis_T_A = ifelse(wmis_T_A > 0, wmis_T_A, NA)
# wmis_T_B = ifelse(wmis_T_B > 0, wmis_T_B, NA)
# wnon_T_A = ifelse(wnon_T_A > 0, wnon_T_A, NA)
# wnon_T_B = ifelse(wnon_T_B > 0, wnon_T_B, NA)
# wind_T_A = ifelse(wind_T_A > 0, wind_T_A, NA)
# wind_T_B = ifelse(wind_T_B > 0, wind_T_B, NA)
# Correct negative cases in single-group estimates (these are sometimes produced due to numerical errors and can safely be made zero)
wmis_A = ifelse(wmis_A > 0, wmis_A, 0)
wmis_B = ifelse(wmis_B > 0, wmis_B, 0)
wnon_A = ifelse(wnon_A > 0, wnon_A, 0)
wnon_B = ifelse(wnon_B > 0, wnon_B, 0)
wind_A = ifelse(wind_A > 0, wind_A, 0)
wind_B = ifelse(wind_B > 0, wind_B, 0)
theta_indels_A <- dnds1$nbregind$theta
theta_indels_B <- dnds2$nbregind$theta
# Log likelihood for Indels
#llind1 = pnbinom(q=n_ind_A-1, mu=exp_ind_A * wind_A, size=theta_indels_A, lower.tail=F,log=T) + pnbinom(q=n_ind_B-1, mu=exp_ind_B * wind_B, size=theta_indels_B, lower.tail=F,log=T)
#llind0 = pnbinom(q=n_ind_A-1, mu=exp_ind_A * wind_T_A, size=theta_indels_A, lower.tail=F,log=T) + pnbinom(q=n_ind_B-1, mu=exp_ind_B * wind_T_B, size=theta_indels_B, lower.tail=F,log=T)
llind1 = dnbinom(x=n_ind_A, mu=exp_ind_A * wind_A, size=theta_indels_A,log=T) + dnbinom(x=n_ind_B, mu=exp_ind_B * wind_B, size=theta_indels_B,log=T)
llind0 = dnbinom(x=n_ind_A, mu=exp_ind_A * wind_T_A, size=theta_indels_A,log=T) + dnbinom(x=n_ind_B, mu=exp_ind_B * wind_T_B, size=theta_indels_B,log=T)
# Log likelihood for SNV
llmis_A = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_T_A,wnon_A,wnon_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
llmis_B = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_T_B,wnon_B,wnon_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
llmis = llmis_A + llmis_B
llmis_check_A = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_A,wnon_A,wnon_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
lltrunc_A = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_A,wnon_T_A,wnon_T_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
lltrunc_B = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_B,wnon_T_B,wnon_T_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
lltrunc = lltrunc_A + lltrunc_B
lltrunc_check_B = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_B,wnon_B,wnon_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
llallA = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_T_A,wnon_T_A,wnon_T_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
llallB = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_T_B,wnon_T_B,wnon_T_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
llall = llallA + llallB # loglik free wmis, free wnon==wspl
return(c(llmis_A, llmis_B, lltrunc_A, lltrunc_B, llmis_check_A, lltrunc_check_B, llmis, lltrunc, llall, llallA, llallB, llind0, llind1))
}
# Put the data in the same order
gene_name_order = sapply(dnds1$RefCDS, function(x) x$gene_name)
dnds1$sel_cv <- dnds1$sel_cv[match(gene_name_order, dnds1$sel_cv$gene_name),]
gene_name_order = sapply(dnds2$RefCDS, function(x) x$gene_name)
dnds2$sel_cv <- dnds2$sel_cv[match(gene_name_order, dnds2$sel_cv$gene_name),]
num_patients_A <- length(as.vector(unique(dnds1$annotmuts$sampleID)))
num_patients_B <- length(as.vector(unique(dnds2$annotmuts$sampleID)))
h0_sel_cv = as.data.frame(t(sapply(1:nrow(dnds1$genemuts), my_selfun_cv)))
print("Finished log likelihood ratios")
h0_sel_cv = cbind(dnds1$genemuts[,1],h0_sel_cv)
colnames(h0_sel_cv) = c("gene_name", "llmis_A", "llmis_B", "lltrunc_A", "lltrunc_B", "llmis_check_A", "lltrunc_check_B", "llmis", "lltrunc", "llall", "llallA", "llallB", "llind0", "llind1")
return(h0_sel_cv)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.