data-raw/openrun.R

#getPrevalence<-function(snp_allelecount_df, ref_allelecount_df,phasing_association_df, major_copynumber_df,minor_copynumber_df,normalfraction_df,nbFirstColumns,tumoursamples, region=NULL)
{
  
  #data("CaseStudy_10")
  #attach(CaseStudy_10)
  
  attach(InputData)
  region=NULL
  nbFirstColumns=3
  tumoursamples=NULL
  min_cells=2
  min_alleles=4
  
  
  
  hg19_dfsize<-list(chr1=249250621,chr2=243199373,chr3=198022430, chr4=191154276, 
                    chr5=180915260, chr6=171115067, chr7=159138663, chr8=146364022, 
                    chr9=141213431, chr10=135534747, chr11=135006516, chr12=133851895, 
                    chr13=115169878, chr14=107349540,chr15=102531392, chr16=90354753,
                    chr17=81195210, chr18=78077248, chr19=59128983, chr20=63025520,
                    chr21=48129895,chr22=51304566, chrX=155270560, chrY=59373566,
                    chrM=16571)
  
  
  
  
  # Extract the somatic mutations 
  
  
  compulsory_columns=c("Chrom","End","IsGermline")
  
  if (length(setdiff(compulsory_columns,colnames(snp_allelecount_df)))>0){
    stop(" The allele count master matrices should have at least the following headers
         columns : ")
    print(compulsory_columns)
  }
  
  
  if (length(setdiff(compulsory_columns,colnames(ref_allelecount_df)))>0){
    stop(" The allele count master matrices should have at least the following 
         headers columns : ")
    print(compulsory_columns)
  }
  
  if (is.null(tumoursamples)){
    tumoursamples = colnames(snp_allelecount_df[(nbFirstColumns+1):ncol(snp_allelecount_df)])
  }
  
  tumoursamples = Reduce(intersect,list(tumoursamples,colnames(snp_allelecount_df),
                                        colnames(ref_allelecount_df),
                                        colnames(major_copynumber_df), 
                                        colnames(minor_copynumber_df),
                                        colnames(normalfraction_df)))
  if(length(tumoursamples) ==0)
  {
    stop(" None of the tumour samples provided is present in the five  matrices :
         snp_allelecount_df, ref_allelecount_df, major_copynumber_df,minor_copynumber_df,
         normalfraction_df")
  }
  
  snp_allelecount_df=numeric_column(snp_allelecount_df,tumoursamples)
  ref_allelecount_df=numeric_column(ref_allelecount_df,tumoursamples)  
  major_copynumber_df=numeric_column(major_copynumber_df,tumoursamples)
  minor_copynumber_df=numeric_column(minor_copynumber_df,tumoursamples)
  normalfraction_df=numeric_column(normalfraction_df,tumoursamples)
  
  somatic_snp_allelecount_df = snp_allelecount_df[snp_allelecount_df$IsGermline==0, ]
  
  
  
  # Region to compute the prevalence
  
  if(!is.null(region)){
    region_parts= unlist(strsplit(region,":"))
    
    chrom = region_parts[1]
    startPosition = 1
    endPosition = hg19_dfsize[chrom]
    
    somatic_snp_allelecount_df = somatic_snp_allelecount_df[somatic_snp_allelecount_df$Chrom == chrom , ]
    
    
    if(length(region_parts)>1){
      coordinates = unlist(strsplit(region_parts[2],"-"))
      startPosition = as.numeric(coordinates[1])
      endPosition = as.numeric(coordinates[2])
      somatic_snp_allelecount_df = somatic_snp_allelecount_df[somatic_snp_allelecount_df$Chrom == chrom & somatic_snp_allelecount_df$Start >= startPosition & somatic_snp_allelecount_df$End <= endPosition,]
      
    }
    
  }
  
  
  
  
  
  
  imutation=0
  
  
  masterprevalence<-matrix(nrow=nrow(somatic_snp_allelecount_df),ncol=nbFirstColumns + length(tumoursamples))
  masterprevalence<-as.data.frame(masterprevalence)
  colnames(masterprevalence) <- c(colnames(somatic_snp_allelecount_df[1:nbFirstColumns]),tumoursamples)
  rownames(masterprevalence) <- rownames(somatic_snp_allelecount_df)
  masterprevalence[1:nbFirstColumns] = somatic_snp_allelecount_df[1:nbFirstColumns]
  
  
  for (imut in 1:nrow(masterprevalence))
  {
    
    
    imutation =imutation+1; 
    mut <- rownames(masterprevalence[imut,]); mut_pos=as.numeric(masterprevalence[imut,"End"])
    
    
      output=(((imutation-1) %%100)==0)
     if(output)
       cat (" \n\n\n\n  mut ",imutation,"/ ",nrow(masterprevalence)," (", mut,") ")
    
    
    
    ############################
    #####Source of information 1: The list of phased germline
    ##############
    
    # Retrieving the phased germline, if not phased germline skip
    
    phased_germline=""
    phased_germline=  as.character(phasing_association_df[mut,"PhasedMutations"])
    
    if(is.null(phased_germline))
      next
    phased_list<-unlist(strsplit(phased_germline,":"));  
    if (length(unlist(phased_list))==0)
      next
    
    #Keep only the germline having allele counts
    phased_list=intersect(phased_list, rownames(snp_allelecount_df[snp_allelecount_df$IsGermline==1,]))
    
    if (length(unlist(phased_list))==0)
      next
    
    #  if(output) 
    #   cat ( " :  ", length(phased_list), "Phased present in the Alelle matrice")  
    
    
    
    
    
    ############################
    #####Source of information 2: The Copy Number 
    ##############
    
    
    #For the somatic mutation
    phi_cn_sample_somatic_list = 1-normalfraction_df[mut,tumoursamples,drop=F]
    Major_cn_sample_somatic_list = major_copynumber_df[mut,tumoursamples, drop=F]
    Minor_cn_sample_somatic_list = minor_copynumber_df[mut,tumoursamples,drop=F]
    Total_cn_sample_somatic_list = Major_cn_sample_somatic_list + Minor_cn_sample_somatic_list 
    
    
    #For the phased germline mutations
    phi_cn_sample_PhasedGermline_df = 1-normalfraction_df[phased_list,tumoursamples,drop=F]
    Major_cn_sample_PhasedGermline_df = major_copynumber_df[phased_list,tumoursamples,drop=F ]
    Minor_cn_sample_PhasedGermline_df = minor_copynumber_df[phased_list,tumoursamples,drop=F]
    Total_cn_sample_PhasedGermline_df = Major_cn_sample_PhasedGermline_df + Minor_cn_sample_PhasedGermline_df
    
    
    
    ############################
    #####Source of information 3: The Allele Count 
    ##############
    ############################
    
    
    #Somatic
    #wellfraction_somatic =snp_allelecount_df[mut,cifs:nbcolumns_wellfraction]
    snpwellcount_somatic =  data.matrix(snp_allelecount_df[mut,tumoursamples,drop=F])
    refwellcount_somatic = data.matrix(ref_allelecount_df[mut,tumoursamples,drop=F])
    
    #Germline
    # wellfraction_germlines=snp_allelecount_df[phased_list,cifs:nbcolumns_wellfraction]
    snpwellcount_germlines=  data.matrix(snp_allelecount_df[phased_list,tumoursamples, drop=F])
    refwellcount_germlines = data.matrix(ref_allelecount_df[phased_list,tumoursamples, drop=F])
    
    
    
    ############################
    ##### Mutations locus restriction
    ##############
    ############################
    
    # We need to pass trough each samples,  and assign NA to the entries of any phased germline appearing to not be on the same locus with the somatic mutation ( not having same phi, major_cn and minor_cn as the somatic and with no  cn breakpoint between the germline and the somatic.)
    
    submatrix_phased = snp_allelecount_df[phased_list, 1:3 , drop=F] #retrieve a submatrix of the phased with the chromosome and the position.
    
    submatrix_phased_leftside = submatrix_phased[submatrix_phased$End <mut_pos,1:3 , drop=F ] #germlines at the left of the mutation 
    submatrix_phased_rightside = submatrix_phased[submatrix_phased$End >mut_pos,1:3 , drop=F ] # germline at the right
    
    #order the leftside from highest to smallest position, leave the rightside from smallest to highest
    orders_phase=order(submatrix_phased_leftside["End"],decreasing=T)
    submatrix_phased_leftside=submatrix_phased_leftside[orders_phase,]
    
    
    #Germline
    #wellfraction_samelocus_germlines=wellfraction_germlines
    snpwellcount_samelocus_germlines= snpwellcount_germlines
    refwellcount_samelocus_germlines = refwellcount_germlines 
    
    
    at_least_one_good_germline=F # is there atleast one good germline kept?
    
    
    
    for(sample in tumoursamples)
    {
      samelocus_germline=c()
      phi_som=phi_cn_sample_somatic_list[sample]
      major_som=Major_cn_sample_somatic_list[sample]
      minor_som=Minor_cn_sample_somatic_list[sample]
      allelecount_som=snpwellcount_somatic[mut,sample]
      if (is.na(allelecount_som)) #Nothing wont be done on this sample anyway withut the snp count of the somatic.
        next
      
      absence_copynumberprofile=c()
      count_lower_than_somatic<-c() # For more accuracy in case of abundance of germline, someone can choose to exclude germline having an allele count less than the somatic allele count.
      
      for (germ in rownames(submatrix_phased_leftside))
      {
        phi_germ=phi_cn_sample_PhasedGermline_df[germ , sample]
        major_germ=Major_cn_sample_PhasedGermline_df[germ , sample]
        minor_germ=Minor_cn_sample_PhasedGermline_df[germ, sample]
        # allelecount_germ= snpwellcount_samelocus_germlines[germ,sample]
        
        if ( (phi_germ==phi_som || is.na(phi_germ)|| is.na(phi_som))&& 
             (major_germ==major_som || is.na(major_germ)|| is.na(major_som)) &&
             (minor_germ == minor_som || is.na(minor_germ)|| is.na(minor_som)))
        {
          samelocus_germline<-c(samelocus_germline,germ)
        }else
        {
          next
        }
        
        if(is.na(phi_germ) || is.na(major_germ))# We dont want NA to be considered as a breakpoint, but if the copy number information is absent we need to not consider this germline at that particular sample
          absence_copynumberprofile=c(absence_copynumberprofile,germ)
        
        # if(allelecount_germ< 1 * allelecount_som & !is.na(allelecount_germ))
        #   count_lower_than_somatic<-c( count_lower_than_somatic, germ)
        
        
      }
      
      for (germ in rownames(submatrix_phased_rightside))
      {
        phi_germ=phi_cn_sample_PhasedGermline_df[germ , sample]
        major_germ=Major_cn_sample_PhasedGermline_df[germ , sample]
        minor_germ=Minor_cn_sample_PhasedGermline_df[germ, sample]
        # allelecount_germ= snpwellcount_samelocus_germlines[germ,sample]
        
        if ( (phi_germ==phi_som || is.na(phi_germ)|| is.na(phi_som))&& 
             (major_germ==major_som || is.na(major_germ)|| is.na(major_som)) && 
             (minor_germ == minor_som || is.na(minor_germ)|| is.na(minor_som)))
        {
          samelocus_germline<-c(samelocus_germline,germ)
        }else
        {
          next
        }
        
        if(is.na(phi_germ) || is.na(major_germ))
          absence_copynumberprofile=c(absence_copynumberprofile,germ)
        
        # if(allelecount_germ< 1 * allelecount_som & !is.na(allelecount_germ))
        #   count_lower_than_somatic<-c( count_lower_than_somatic, germ)
        
        
      }
      
      #if we assign NA to the alelle count information of germline mutation not on the same somatic mutation, we are removing them from the list at the considered sample
      notsamelocus_germline = setdiff(phased_list, samelocus_germline)
      germline_to_exclude_at_this_sample=c(notsamelocus_germline,absence_copynumberprofile)
      
      if(!at_least_one_good_germline)
        at_least_one_good_germline=(length(germline_to_exclude_at_this_sample)< length(phased_list))
      
      
      if(length(germline_to_exclude_at_this_sample) ==0) next
      
      
      
      #  wellfraction_samelocus_germlines[germline_to_exclude_at_this_sample,sample]= rep(NA, length(germline_to_exclude_at_this_sample))
      snpwellcount_samelocus_germlines[germline_to_exclude_at_this_sample,sample]= rep(NA, length(germline_to_exclude_at_this_sample))
      refwellcount_samelocus_germlines [germline_to_exclude_at_this_sample,sample]= rep(NA, length(germline_to_exclude_at_this_sample))
      
    }
   
    if(!  at_least_one_good_germline) next # there is no good gerline
    
    ############################
    ##### We prepare the input for the formulw
    ##############
    ############################
    
    #Allele Count
    ############
    # The somatic mutation
    lambda_somatic_list=snpwellcount_somatic[mut, tumoursamples,drop=F] # \lambda(S) across the samples (see paper)
    
    # The germline  mutation, approximation according to poisson distribution, see paper
    lambda_PhasedGermline_list<-colMeans(data.matrix(snpwellcount_samelocus_germlines), na.rm=T)  #  for \lambda(G)
    mu_PhasedGermline_list<-colMeans(data.matrix(refwellcount_samelocus_germlines), na.rm=T) # for \mu(G)
    
    
    
    omega_PhasedGermline_mean_list<-lambda_PhasedGermline_list/(lambda_PhasedGermline_list+mu_PhasedGermline_list) # for \omega(G)
    
    #Copy number profile (simply the one of the somatic mutation locus)
    ############
    phi_cn_list= unlist(phi_cn_sample_somatic_list)
    Major_cn_list= unlist(Major_cn_sample_somatic_list)
    Minor_cn_list = unlist(Minor_cn_sample_somatic_list)
    
    
    
    ##########
    #THE Allele count MODEL
    #########
    ########
    
    
    
    #First, we separate the samples according to their copy number profile. 
    CNV_groups<-list()
    for (phival in unique(phi_cn_list)) 
      for(majorval in unique(Major_cn_list)) 
        for (minorval in unique(Minor_cn_list))
        {
          
          if(!is.na(phival) && !is.na(majorval) && !is.na(minorval))
          {
            samplecnvgroups=c()
            
            for (sample in tumoursamples)
              if(!is.na(phi_cn_list[sample]) && !is.na(Major_cn_list[sample]) && !is.na(Minor_cn_list[sample]))
                if ((phi_cn_list[sample] ==phival) &&(Major_cn_list[sample]==majorval) && (Minor_cn_list[sample]== minorval ))
                  samplecnvgroups=c(samplecnvgroups,sample)
                if(length(samplecnvgroups)!=0)
                  CNV_groups[length(CNV_groups) +1] = list(samplecnvgroups)
          }
        }
    
    
    
    
    
    sigma_PhasedGermline_list=Major_cn_list-Major_cn_list
    tau_PhasedGermline_list= phi_cn_list * sigma_PhasedGermline_list + (1-phi_cn_list) # for \tau(G)
    hatlambda_somatic_list=lambda_somatic_list-lambda_somatic_list
    hatlambda_PhasedGermline_list=lambda_PhasedGermline_list-lambda_PhasedGermline_list
    condition_list=rep(NA,length(lambda_somatic_list))
    names(condition_list)= names(lambda_somatic_list)
    
    
    #We run the model on each group
    for (icnvgroup in 1 : length(CNV_groups))
    {
      
      sample_cnvgroup=CNV_groups[[icnvgroup]]
      
      
      # Estimate of sigma. See the paper
      #################################
      A=0
      B=0
      Asnp=0
      Bref=0
      for (sample_i in sample_cnvgroup)
      {
        
        
        for ( germlinemut in rownames(snpwellcount_samelocus_germlines) )
        {
          if (!is.na(snpwellcount_samelocus_germlines[germlinemut,sample_i]) #&& (snpwellcount_samelocus_germlines[germlinemut,sample_i]>0)
          )
            Asnp= Asnp + snpwellcount_samelocus_germlines[germlinemut,sample_i]
          if (!is.na(refwellcount_samelocus_germlines[germlinemut,sample_i]) #&& (refwellcount_samelocus_germlines[germlinemut,sample_i]>0)
          )
            Bref=Bref+refwellcount_samelocus_germlines[germlinemut,sample_i]
          
          if(!is.na(snpwellcount_samelocus_germlines[germlinemut,sample_i]) #&& (snpwellcount_samelocus_germlines[germlinemut,sample_i]>0) 
             &&
             !is.na(refwellcount_samelocus_germlines[germlinemut,sample_i]) #&& (refwellcount_samelocus_germlines[germlinemut,sample_i]>0)
          )
          {
            A= A + snpwellcount_samelocus_germlines[germlinemut,sample_i]
            B=B+refwellcount_samelocus_germlines[germlinemut,sample_i]
          }
        }
      }
      
      if((is.na(A)) || (is.na(B)))
      {
        A=Asnp
        B=Bref
      }
      if(A>=B)
        sigma_PhasedGermline_list[sample_cnvgroup] = Major_cn_list[sample_cnvgroup] 
      if(A<B)
        sigma_PhasedGermline_list[sample_cnvgroup] = Minor_cn_list[sample_cnvgroup] 
      
      
      tau_PhasedGermline_list[sample_cnvgroup]= phi_cn_list[sample_cnvgroup] * sigma_PhasedGermline_list[sample_cnvgroup] + (1-phi_cn_list[sample_cnvgroup]) # for \tau(G)
      
      
      #Allele_Count
      alpha_list=1/tau_PhasedGermline_list[sample_cnvgroup]
      beta_list=(phi_cn_list[sample_cnvgroup]  * sigma_PhasedGermline_list[sample_cnvgroup] )/ tau_PhasedGermline_list[sample_cnvgroup]
      
      #condition_list2=lambda_somatic_list - alpha_condition * lambda_PhasedGermline_list# The condition, if >=0 then case 1 else case 2
      lambda_S=as.numeric(lambda_somatic_list[mut,sample_cnvgroup])
      lambda_G=data.matrix(snpwellcount_samelocus_germlines[phased_list,sample_cnvgroup,drop=F])
      alpha=as.numeric(alpha_list[sample_cnvgroup])
      beta=as.numeric(beta_list[sample_cnvgroup])
      
      EM_parameters=bestAllele(lambda_S, lambda_G, alpha, beta)
      
      
      
      hatlambda_somatic_list[mut, sample_cnvgroup] =EM_parameters$hatlambda_S
      hatlambda_PhasedGermline_list[sample_cnvgroup] =EM_parameters$hatlambda_G
      condition=EM_parameters$bestC
      condition_list[sample_cnvgroup]=rep(condition, length(lambda_S))
      
      
    }
    
    
    
    ############################
    #####   ############################
    ##### Intermediary computation
    ##############
    ############################
    
    u_list =phi_cn_list * hatlambda_PhasedGermline_list/tau_PhasedGermline_list # for u
    v_list=(1-phi_cn_list) * hatlambda_PhasedGermline_list/tau_PhasedGermline_list # for v
    
    sum_cells_list<-u_list+v_list # estimation of the number of cells if well counts and the K * the number of cells of reads count with k = amplification factor
    sum_allele_list= hatlambda_PhasedGermline_list +  mu_PhasedGermline_list # estimation of the number of allele
    
    
    
    ############################
    ##### Prevalence computation
    ##############
    ############################
    
    #`The user should specify a minimum count of cells and allele for the prevalence to be computed. By default we take 6 cells and 8 alleles.
#     min_cells=4
#     min_alleles=6
#     
    
    
    prev_somatic_list =vector("numeric", length=length(hatlambda_somatic_list))
    names(prev_somatic_list) = colnames(hatlambda_somatic_list)
    prev_somatic_list[prev_somatic_list==0]<-NA
    
    
    for(sample in tumoursamples)
    {
      
      if (is.na(condition_list[sample]))
        next
      
      if(is.na(sum_cells_list[sample]) || is.na(sum_allele_list[sample]))
        next
      if (sum_cells_list[sample]< min_cells)
        next
      if(sum_allele_list[sample]< min_alleles)
        next

      if (condition_list[sample]=="C2")#CONTEXTE 2
        prev_somatic_list[sample] = as.numeric(unlist(   phi_cn_list[sample] + (1-phi_cn_list[sample]) * ((hatlambda_somatic_list[mut,sample] - u_list[sample] * sigma_PhasedGermline_list[sample])/v_list[sample])   ))
      else if (condition_list[sample]=="C1")
        prev_somatic_list[sample] = as.numeric(unlist( (hatlambda_somatic_list[mut,sample] / hatlambda_PhasedGermline_list[sample])* tau_PhasedGermline_list[sample]    ))
      
      #if(sample=="O13_A_ABpre")
      #   exit()
      #if(!is.na(prev_somatic_list[sample]) && prev_somatic_list[sample] > 1.00000001 ) exit()
      #if(!is.na(prev_somatic_list[sample]) && prev_somatic_list[sample] < 0 ) exit()
      
    }
    
    
    masterprevalence[mut,names(prev_somatic_list)] = prev_somatic_list
    
    
    
    masterprevalence[mut,names(prev_somatic_list)] = prev_somatic_list
    
    list_prev=unlist(prev_somatic_list)
    list_prev=list_prev[!is.na(list_prev)]
    
    
    
    
  }
  
  
  masterprevalence
  
  
  
  
  
}
chedonat/OncoPhase documentation built on May 13, 2019, 3:39 p.m.