R/lik_func_ex.R

Defines functions lik_func

# dnds1 and dnds2 are dndsout objects generated by running dndscv on a list of known cancer genes in two datasets.

lik_func = function(dnds1, dnds2) {
  
  # Subfunction: Analytical opt_t using only neutral subs
  mle_tcv = function(n_neutral, exp_rel_neutral, shape, scale) {
    tml = (n_neutral+shape-1)/(exp_rel_neutral+(1/scale))
    if (shape<=1) { # i.e. when theta<=1
      tml = max(shape*scale,tml) # i.e. tml is bounded to the mean of the gamma (i.e. y[9]) when theta<=1, since otherwise it takes meaningless values
    }
    return(tml)
  }
  
  my_selfun_cv = function(j) {
    
    # Define variables for group A
    xA_N = dnds1$RefCDS[[j]]$N
    xA_L = dnds1$RefCDS[[j]]$L
    y_A = as.numeric(dnds1$genemuts[j,-1])
    exp_rel_A = y_A[5:8]/y_A[5]
    shape_A = dnds1$nbreg$theta
    scale_A = y_A[9]/dnds1$nbreg$theta
    
    # Define variables for group B
    xB_N = dnds2$RefCDS[[j]]$N
    xB_L = dnds2$RefCDS[[j]]$L
    y_B = as.numeric(dnds2$genemuts[j,-1])
    exp_rel_B = y_B[5:8]/y_B[5]
    shape_B = dnds2$nbreg$theta
    scale_B = y_B[9]/dnds2$nbreg$theta
    
    numrates = length(dnds1$mutrates)
    
    indneut = 1
    opt_t_A = mle_tcv(n_neutral=sum(y_A[indneut]), exp_rel_neutral=sum(exp_rel_A[indneut]), shape=shape_A, scale=scale_A)
    opt_t_B = mle_tcv(n_neutral=sum(y_B[indneut]), exp_rel_neutral=sum(exp_rel_B[indneut]), shape=shape_B, scale=scale_B)
    mrfold_A = max(1e-10, opt_t_A/y_A[5]) # Correction factor of "t" based on the obs/exp ratio of "neutral" mutations under the model
    mrfold_B = max(1e-10, opt_t_B/y_B[5]) # Correction factor of "t" based on the obs/exp ratio of "neutral" mutations under the model
    
    # Store obs and expected counts
    n_obs_mis_A = y_A[2]
    n_obs_mis_B = y_B[2]
    n_obs_non_A = sum(y_A[3:4])
    n_obs_non_B = sum(y_B[3:4])
    n_ind_A     = dnds1$sel_cv$n_ind[j]
    n_ind_B     = dnds2$sel_cv$n_ind[j]
    
    n_exp_mis_A = y_A[6]
    n_exp_mis_B = y_B[6]
    n_exp_non_A = sum(y_A[7:8])
    n_exp_non_B = sum(y_B[7:8])
    exp_ind_A   = dnds1$sel_cv$exp_ind[j]
    exp_ind_B   = dnds2$sel_cv$exp_ind[j]
    
    # Calculate Excess for Indels
    excess_ind_A = (n_ind_A - exp_ind_A) / num_patients_A
    excess_ind_B = (n_ind_B - exp_ind_B) / num_patients_B
    excess_ind_T = ((n_ind_A + n_ind_B) - (exp_ind_A + exp_ind_B)) / (num_patients_A + num_patients_B)
    
    # Calculate Excess for SNVs
    excess_mis_T = ( (n_obs_mis_A + n_obs_mis_B) - (n_exp_mis_A * mrfold_A + n_exp_mis_B * mrfold_B)) / (num_patients_A + num_patients_B)
    excess_non_T = ( (n_obs_non_A + n_obs_non_B) - (n_exp_non_A * mrfold_A + n_exp_non_B * mrfold_B)) / (num_patients_A + num_patients_B)
    
    excess_mis_A = (n_obs_mis_A - (n_exp_mis_A * mrfold_A)) / num_patients_A
    excess_mis_B = (n_obs_mis_B - (n_exp_mis_B * mrfold_B)) / num_patients_B
    excess_non_A = (n_obs_non_A - (n_exp_non_A * mrfold_A)) / num_patients_A
    excess_non_B = (n_obs_non_B - (n_exp_non_B * mrfold_B)) / num_patients_B
    
    # Calculate "dN/dS" for SNVs
    wmis_T_A = 1 + ((excess_mis_T * num_patients_A) / (n_exp_mis_A * mrfold_A))
    wmis_T_B = 1 + ((excess_mis_T * num_patients_B) / (n_exp_mis_B * mrfold_B))
    wnon_T_A = 1 + ((excess_non_T * num_patients_A) / (n_exp_non_A * mrfold_A))
    wnon_T_B = 1 + ((excess_non_T * num_patients_B) / (n_exp_non_B * mrfold_B))
    
    wmis_A = 1 + ((excess_mis_A * num_patients_A) / (n_exp_mis_A * mrfold_A))
    wmis_B = 1 + ((excess_mis_B * num_patients_B) / (n_exp_mis_B * mrfold_B))
    wnon_A = 1 + ((excess_non_A * num_patients_A) / (n_exp_non_A * mrfold_A))
    wnon_B = 1 + ((excess_non_B * num_patients_B) / (n_exp_non_B * mrfold_B))
    
    # Calculate "dN/dS" for Indels
    wind_A   = 1 + ((excess_ind_A * num_patients_A) / exp_ind_A)
    wind_B   = 1 + ((excess_ind_B * num_patients_B) / exp_ind_B)
    wind_T_A = 1 + ((excess_ind_T * num_patients_A) / exp_ind_A)
    wind_T_B = 1 + ((excess_ind_T * num_patients_B) / exp_ind_B)
    
    # Prevent negative dn/ds in joint estimates: in those cases, estimate the total dn/ds as (obs_A + obs_B)/(exp_A + exp_B). This is equivalent to testing for differences dn/ds instead of Delta.Nd, which is reasonable in the case of negative selection. 
    wmis_T_A_new = ifelse(wmis_T_A > 0 & wmis_T_B >0, wmis_T_A, (n_obs_mis_A + n_obs_mis_B)/(n_exp_mis_A * mrfold_A + n_exp_mis_B * mrfold_B))
    wmis_T_B_new = ifelse(wmis_T_A > 0 & wmis_T_B >0, wmis_T_B, (n_obs_mis_A + n_obs_mis_B)/(n_exp_mis_A * mrfold_A + n_exp_mis_B * mrfold_B))
    wnon_T_A_new = ifelse(wnon_T_A > 0 & wnon_T_B >0, wnon_T_A, (n_obs_non_A + n_obs_non_B)/(n_exp_non_A * mrfold_A + n_exp_non_B * mrfold_B))
    wnon_T_B_new = ifelse(wnon_T_A > 0 & wnon_T_B >0, wnon_T_B, (n_obs_non_A + n_obs_non_B)/(n_exp_non_A * mrfold_A + n_exp_non_B * mrfold_B))
    wind_T_A_new = ifelse(wind_T_A > 0 & wind_T_B >0, wind_T_A, (n_ind_A + n_ind_B)/(exp_ind_A + exp_ind_B))
    wind_T_B_new = ifelse(wind_T_A > 0 & wind_T_B >0, wind_T_B, (n_ind_A + n_ind_B)/(exp_ind_A + exp_ind_B))
    wmis_T_A = wmis_T_A_new
    wmis_T_B = wmis_T_B_new
    wnon_T_A = wnon_T_A_new
    wnon_T_B = wnon_T_B_new
    wind_T_A = wind_T_A_new
    wind_T_B = wind_T_B_new
    
    # Alternative (more onservative) approach to deal with negative dn/ds: replace them by NA. Accompany this by a note explaining that NA's are associated with cases of conditional purifying selection. In some of those cases the assumptions of the null model are violated and it is not feasible to compare differences in the number of purged mutations.
    # wmis_T_A = ifelse(wmis_T_A > 0, wmis_T_A, NA)
    # wmis_T_B = ifelse(wmis_T_B > 0, wmis_T_B, NA)
    # wnon_T_A = ifelse(wnon_T_A > 0, wnon_T_A, NA)
    # wnon_T_B = ifelse(wnon_T_B > 0, wnon_T_B, NA)
    # wind_T_A = ifelse(wind_T_A > 0, wind_T_A, NA)
    # wind_T_B = ifelse(wind_T_B > 0, wind_T_B, NA)
    
    # Correct negative cases in single-group estimates (these are sometimes produced due to numerical errors and can safely be made zero)
    wmis_A = ifelse(wmis_A > 0, wmis_A, 0)
    wmis_B = ifelse(wmis_B > 0, wmis_B, 0)
    wnon_A = ifelse(wnon_A > 0, wnon_A, 0)
    wnon_B = ifelse(wnon_B > 0, wnon_B, 0)
    wind_A = ifelse(wind_A > 0, wind_A, 0)
    wind_B = ifelse(wind_B > 0, wind_B, 0)
    
    theta_indels_A <- dnds1$nbregind$theta
    theta_indels_B <- dnds2$nbregind$theta
    
    # Log likelihood for Indels
    #llind1 = pnbinom(q=n_ind_A-1, mu=exp_ind_A * wind_A,   size=theta_indels_A, lower.tail=F,log=T) + pnbinom(q=n_ind_B-1, mu=exp_ind_B * wind_B,   size=theta_indels_B, lower.tail=F,log=T)
    #llind0 = pnbinom(q=n_ind_A-1, mu=exp_ind_A * wind_T_A, size=theta_indels_A, lower.tail=F,log=T) + pnbinom(q=n_ind_B-1, mu=exp_ind_B * wind_T_B, size=theta_indels_B, lower.tail=F,log=T)
    llind1 = dnbinom(x=n_ind_A, mu=exp_ind_A * wind_A,   size=theta_indels_A,log=T) + dnbinom(x=n_ind_B, mu=exp_ind_B * wind_B,   size=theta_indels_B,log=T)
    llind0 = dnbinom(x=n_ind_A, mu=exp_ind_A * wind_T_A, size=theta_indels_A,log=T) + dnbinom(x=n_ind_B, mu=exp_ind_B * wind_T_B, size=theta_indels_B,log=T)
    
    # Log likelihood for SNV
    llmis_A = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_T_A,wnon_A,wnon_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
    llmis_B = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_T_B,wnon_B,wnon_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
    llmis = llmis_A + llmis_B
    llmis_check_A = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_A,wnon_A,wnon_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
    
    lltrunc_A = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_A,wnon_T_A,wnon_T_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
    lltrunc_B = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_B,wnon_T_B,wnon_T_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
    lltrunc = lltrunc_A + lltrunc_B
    lltrunc_check_B = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_B,wnon_B,wnon_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
    
    llallA = sum(dpois(x=xA_N, lambda=xA_L*dnds1$mutrates*mrfold_A*t(array(c(1,wmis_T_A,wnon_T_A,wnon_T_A),dim=c(4,numrates))), log=T)) + dgamma(opt_t_A, shape=shape_A, scale=scale_A, log=T)
    llallB = sum(dpois(x=xB_N, lambda=xB_L*dnds2$mutrates*mrfold_B*t(array(c(1,wmis_T_B,wnon_T_B,wnon_T_B),dim=c(4,numrates))), log=T)) + dgamma(opt_t_B, shape=shape_B, scale=scale_B, log=T)
    llall  = llallA + llallB # loglik free wmis, free wnon==wspl
    
    return(c(llmis_A, llmis_B, lltrunc_A, lltrunc_B, llmis_check_A, lltrunc_check_B, llmis, lltrunc, llall, llallA, llallB, llind0, llind1))
  }
  
  # Put the data in the same order
  gene_name_order = sapply(dnds1$RefCDS, function(x) x$gene_name)
  dnds1$sel_cv <- dnds1$sel_cv[match(gene_name_order, dnds1$sel_cv$gene_name),]
  gene_name_order = sapply(dnds2$RefCDS, function(x) x$gene_name)
  dnds2$sel_cv <- dnds2$sel_cv[match(gene_name_order, dnds2$sel_cv$gene_name),]
  num_patients_A <- length(as.vector(unique(dnds1$annotmuts$sampleID)))
  num_patients_B <- length(as.vector(unique(dnds2$annotmuts$sampleID)))

  h0_sel_cv = as.data.frame(t(sapply(1:nrow(dnds1$genemuts), my_selfun_cv)))
  print("Finished log likelihood ratios")
  h0_sel_cv = cbind(dnds1$genemuts[,1],h0_sel_cv)
  colnames(h0_sel_cv) = c("gene_name", "llmis_A", "llmis_B", "lltrunc_A", "lltrunc_B", "llmis_check_A", "lltrunc_check_B", "llmis", "lltrunc", "llall", "llallA", "llallB", "llind0", "llind1")
  
  return(h0_sel_cv)
}
ggruenhagen3/coselens documentation built on April 17, 2025, 11:56 a.m.