reproduce_2.R
In QuantumClone: Clustering Mutations using High Throughput Sequencing (HTS) Data

### Source code to reproduce the filtering pipeline

strcount <- function(x, pattern='', split=''){
  unlist(lapply(strsplit(x, split),function(z) na.omit(length(grep(pattern, z)))))
}

### Create stringent dataset
QuantumCat_stringent<-function(number_of_clones = 6,number_of_mutations = 200,
                               ploidy = "AB",depth = 100,
                               contamination = c(0.3,0.4),min_depth = 50){
  if(depth<min_depth){
    error("The depth is lower than the minimal depth")
  }
  variants<-QuantumCat(number_of_clones = number_of_clones,number_of_mutations = number_of_mutations,
                       ploidy = ploidy,depth = depth,number_of_samples= 2,
                       contamination = contamination)
  
  test<-{variants[[1]]$Depth<min_depth} | {variants[[2]]$Depth < min_depth}
  
  while(sum(test)>0){ ### Check that all variants are above 50X depth
    for(i in 1:2){
      sparetest<-variants[[i]]$Depth<min_depth
      variants[[i]]$Depth[sparetest]<-rnbinom(n=sum(sparetest),size = 4.331601,mu = depth)
      variants[[i]]$Alt[sparetest]<-rbinom(n = sum(sparetest),
                                           size=variants[[i]]$Depth[sparetest],
                                           prob= variants[[i]]$Freq[sparetest]/100
      )
      
    }
    test<-{variants[[1]]$Depth<min_depth} | {variants[[2]]$Depth < min_depth}
  }
  return(variants)
}

QuantumCat_permissive<-function(fromQuantumCat,number_of_mutations = 200,
                                ploidy = "AB",depth = 100,
                                contamination = c(0.3,0.4),max_depth = 50, min_depth = 30){
  if(depth<min_depth | max_depth < min_depth){
    error("Depth error")
  }
  clones<-1:max(fromQuantumCat[[1]]$Chr) # retrieve clones
  
  # Now retrieve cellularity of each clone:
  cells<-list()
  variants<-list()
  chr<-sample(1:max(fromQuantumCat[[1]]$Chr),size = number_of_mutations,
              replace = TRUE)
  Start<-(1:number_of_mutations)+nrow(fromQuantumCat[[1]])
  for(i in 1:2){
    cells[[i]]<-sapply(clones,function(z){
      fromQuantumCat[[1]]$Cellularit[fromQuantumCat[[1]]$Chr==z][1]
    }
    )
    variants[[i]]<-data.frame(Chr = chr,Start = Start,
                              Cellularit = cells[[i]][chr],
                              Genotype = rep(c("AB","AAB","AABB"),times = c(round(number_of_mutations/4),
                                                                            round(number_of_mutations/2),
                                                                            round(number_of_mutations/4)
                              )),
                              number_of_copies = c(rep(1,times = round(number_of_mutations/4)),
                                                   sample(1:2,size = round(3*number_of_mutations/4),replace = TRUE)
                              ),
                              Depth  = rnbinom(n=number_of_mutations,size = 4.331601,mu = depth)
    )
    variants[[i]]$Frequency<-variants[[i]]$Cellularit*
      variants[[i]]$number_of_copies/(strcount(as.character(variants[[i]]$Genotype)))*
      (1-contamination[i])
    
    variants[[i]]$Alt<-rbinom(n = number_of_mutations,
                              size=variants[[i]]$Depth,
                              prob= variants[[i]]$Freq/100
    )
  }
  
  
  
  test<-{variants[[1]]$Depth<min_depth} | {variants[[2]]$Depth < min_depth} | 
  {variants[[1]]$Depth>max_depth & variants[[1]]$Genotype == "AB"} | 
  {variants[[2]]$Depth > max_depth  & variants[[2]]$Genotype == "AB"}
  while(sum(test)>0){ ### Check that all variants are above 50X depth
    for(i in 1:2){
      sparetest<-variants[[i]]$Depth<min_depth | {variants[[i]]$Depth > max_depth  & variants[[i]]$Genotype == "AB"} 
      variants[[i]]$Depth[sparetest]<-rnbinom(n=sum(sparetest),size = 4.331601,mu = depth)
      variants[[i]]$Alt[sparetest]<-rbinom(n = sum(sparetest),
                                           size=variants[[i]]$Depth[sparetest],
                                           prob= variants[[i]]$Freq[sparetest]/100
      )
      
    }
    test<-{variants[[1]]$Depth<min_depth} | {variants[[2]]$Depth < min_depth} | 
    {variants[[1]]$Depth>max_depth & variants[[1]]$Genotype == "AB"} | 
    {variants[[2]]$Depth > max_depth  & variants[[2]]$Genotype == "AB"}
    
  }
  return(variants)
}


#### Create pipelines:
# Starting with standard
paper_pipeline<- function(filtered,
                          permissive,
                          drivers_id,
                          contamination = c(0.3,0.4)
                          ){
  clustering<-One_step_clustering(SNV_list = filtered,
                                  contamination = contamination,
                                  nclone_range = 2:10,
                                  Init =  2,ncores = 2,
                                  save_plot = FALSE
  )
  
  ext<-list()
  for(i in 1:length(filtered)){
    
    ext[[i]]<-permissive[[i]][drivers_id[drivers_id>max(filtered[[1]]$Start)]-max(filtered[[1]]$Start),]
    #print(drivers_l[[i]])
  }
  
  clustering$driver_info<-Probability.to.belong.to.clone(SNV_list = ext,
                                                         clone_prevalence = clustering$EM.output$centers,
                                                         contamination = contamination,
                                                         clone_weights= clustering$EM.output$weights)
  ### Hard clustering
  ### adding a posteriori clustered data
  for(i in 1:length(filtered)){
    clustering$filtered.data[[i]]<-rbind(clustering$filtered.data[[i]],
                                         cbind(clustering$driver_info$filtered_data[[i]],
                                               ext[[i]][,c("Cellularit","number_of_copies","Frequency")])
                                         )
  }
  clustering$cluster<-c(clustering$cluster,clustering$driver_info$cluster)
  ### adding data from a posteriori clustering

  return(clustering)
  
}

# Creating extended pipeline:

extended<- function(filtered,
                    permissive,
                    drivers_id,
                    contamination = c(0.3,0.4)){
  input<-list()
  for(i in 1:length(permissive)){
    input[[i]]<-rbind(filtered[[i]],
                      permissive[[i]][drivers_id[drivers_id>max(filtered[[1]]$Start)]-max(filtered[[1]]$Start),])
    
  }
  clustering<-One_step_clustering(SNV_list = input,
                                  contamination = contamination,
                                  nclone_range = 2:10,
                                  Init =  2,ncores = 2,
                                  save_plot = FALSE
  )
  
  clustering$driver_clust<-clustering$cluster[clustering$filtered.data[[1]]$Start %in% drivers_id]
  return(clustering)
}

All<-function(filtered,
              permissive,
              drivers_id,
              contamination = c(0.3,0.4)){
  input<-list()
  for(i in 1:length(permissive)){
    input[[i]]<-rbind(filtered[[i]],
                      permissive[[i]])
    
  }
  clustering<-One_step_clustering(SNV_list = input,
                                  contamination = contamination,
                                  nclone_range = 2:10,
                                  Init =  2,ncores = 4,
                                  save_plot = FALSE
  )
  
  ### keep only 215 meaningful
  keep<-clustering$filtered.data[[1]]$Start %in% c(filtered[[1]]$Start,drivers_id)
  for(s in 1:2){
    clustering$filtered.data[[s]]<-clustering$filtered.data[[s]][keep,]
  }
  clustering$cluster<-clustering$cluster[keep]
  clustering$driver_clust<-clustering$cluster[clustering$filtered.data[[1]]$Start %in% drivers_id]
  return(clustering)
}

### Now we are going to compare quality of clustering: NMI, maximal distance to closest clone, number of clusters,
### average error and maximal error for drivers

compare_qual<-function(paper,
                       extended,
                       all,
                       drivers_id){
  
  # NMI
  NMI<-c(Compute_NMI(paper),
         Compute_NMI(extended),
         Compute_NMI(all)
  )
  
  cellularities<-lapply(X = paper$filtered.data,FUN = function(df){
    sapply(unique(df$Chr),FUN = function(ch){
      df$Cellularit[df$Chr==ch][1]/100
    }
    ) 
  }
  )

  #maximal distance to closest clone
  Max.Distance.to.clone<-c(
    MaxDistance(cellularit = cellularities, cluster_cells = paper$EM.output$normalized.centers),
    MaxDistance(cellularit = cellularities, cluster_cells = extended$EM.output$normalized.centers),
    MaxDistance(cellularit = cellularities, cluster_cells = all$EM.output$normalized.centers)
  )
  #number clusters
  
  nclusters<-c(length(paper$EM.output$centers[[1]]),
               length(extended$EM.output$centers[[1]]),
               length(all$EM.output$centers[[1]])
  )
  # average and max error for drivers
  # TODO:
  # * Create test.driver that gets cellularity of drivers
  # * Create retrieve.driver that retrieves true clone of drivers (also get 5 "easy" drivers)
  test.driver<-paper$filtered.data[[1]]$Start %in% drivers_id
  pap.err<-sqrt({
    cellularities[[1]][paper$filtered.data[[1]]$Chr[test.driver]]-
      paper$EM.output$normalized.centers[[1]][paper$cluster[test.driver]]}**2+
      {
        cellularities[[2]][paper$filtered.data[[1]]$Chr[test.driver]]-
          paper$EM.output$normalized.centers[[2]][paper$cluster[test.driver]]}**2
  )

  pap.mut.err<-sqrt({
    paper$filtered.data[[1]]$Cellularit/100-
      paper$EM.output$normalized.centers[[1]][paper$cluster]}**2+
      {
        paper$filtered.data[[2]]$Cellularit/100-
          paper$EM.output$normalized.centers[[2]][paper$cluster]}**2
  )

  ext.test<-cnum(extended$filtered.data[[1]]$Start) %in% cnum(drivers_id)
  ext.Chr<-extended$filtered.data[[1]]$Chr[ext.test]
  ext.clust<-extended$cluster[ext.test]
  

  ext.err<-sqrt({cellularities[[1]][ext.Chr]-
      extended$EM.output$normalized.centers[[1]][ext.clust]}**2+
      {cellularities[[2]][ext.Chr]-
          extended$EM.output$normalized.centers[[2]][ext.clust]}**2
  )
  
  ext.mut.err<-sqrt({extended$filtered.data[[1]]$Cellularit/100-
      extended$EM.output$normalized.centers[[1]][extended$cluster]}**2+
      {extended$filtered.dat[[2]]$Cellularit/100-
          extended$EM.output$normalized.centers[[2]][extended$cluster]}**2
  )
  all.test<-cnum(all$filtered.data[[1]]$Start) %in% cnum(drivers_id)
  all.Chr<-all$filtered.data[[1]]$Chr[all.test]
  all.clust<-all$cluster[all.test]
  
  
  all.err<-sqrt({cellularities[[1]][all.Chr]-
      all$EM.output$normalized.centers[[1]][all.clust]}**2+
      {cellularities[[2]][all.Chr]-
          all$EM.output$normalized.centers[[2]][all.clust]}**2
  )
  all.mut.err<-sqrt({all$filtered.data[[1]]$Cellularit/100-
      all$EM.output$normalized.centers[[1]][all$cluster]}**2+
      {all$filtered.data[[1]]$Cellularit/100-
          all$EM.output$normalized.centers[[2]][all$cluster]}**2
  )
  
  mean.error<-c(mean(pap.mut.err),
                    mean(ext.mut.err),
                    mean(all.mut.err))
  
  mean.driv.error<-c(mean(pap.err),
                     mean(ext.err),
                     mean(all.err))
  
  result<-data.frame(Pipeline = c("paper","extended","all"),
                     NMI = NMI,
                     Max.Distance.to.clone = Max.Distance.to.clone,
                     nclusters = nclusters,
                     mean.mut.error=mean.error,
                     mean.driv.error = mean.driv.error
  )
  return(result)
}


# Convert function
cnum<-function(x) as.numeric(as.character(x))
# MaxDistance
MaxDistance<-function(cellularit, cluster_cells){
  nclones<-length(cellularit[[1]])
  nclus<-length(cluster_cells[[1]])
  xclon<-cellularit[[1]]
  yclon<-cellularit[[2]]

  result<-matrix(nrow = nclus,ncol = nclones)
  
  for(i in 1:nclus){
    result[i,]<-sqrt((cnum(cluster_cells[[1]][i])-xclon)**2+
                       (cnum(cluster_cells[[2]][i])-yclon)**2
    )
  }
  minClustClon<-apply(X = result,MARGIN = 1,min) ### Find the clone that is closest to cluster
  return(max(minClustClon))
  
}

reproduce<-function(iterations,number_mutations,ndrivers){
  set.seed(234)
  
  for(iter in 1:iterations){
    toy.data<-QuantumCat_stringent(number_of_clones = 6,number_of_mutations = number_mutations,
                                   ploidy = "AB",depth = 100,
                                   contamination = c(0.3,0.4),min_depth = 50)
    permissive<-QuantumCat_permissive(fromQuantumCat = toy.data ,number_of_mutations = number_mutations,
                                      ploidy = "AB",depth = 100,
                                      contamination = c(0.3,0.4),max_depth = 50, min_depth = 30)
    drivers_id<-sample(1:(2*number_mutations),size = ndrivers,prob = rep(c(1/{4*number_mutations},
                                                                           3/{4*number_mutations}),
                                                                         each = number_mutations)
    )
    drivers_id<-drivers_id[order(drivers_id)]
    
    
    ext<-extended(filtered = toy.data,
                  permissive = permissive,
                  drivers_id = drivers_id)
    pap<-paper_pipeline(filtered = toy.data,
                        permissive = permissive,
                        drivers_id = drivers_id)
    
    all<-All(filtered = toy.data,
             permissive = permissive,
             drivers_id = drivers_id)
    
    Quality<-compare_qual(paper = pap,
                          extended = ext,
                          all = all,
                          drivers_id = drivers_id)
    
    if(iter == 1){
      result<-Quality
    }
    else{
      result<-rbind(result,Quality)
    }
  }
  
  return(result)
}
Any scripts or data that you put into this service are public.
QuantumClone documentation built on May 2, 2019, 3:03 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
QuantumClone
Clustering Mutations using High Throughput Sequencing (HTS) Data

tests/reproducible_testing/Rscript/reproduce_2.R
In QuantumClone: Clustering Mutations using High Throughput Sequencing (HTS) Data

Try the QuantumClone package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

QuantumClone Clustering Mutations using High Throughput Sequencing (HTS) Data

tests/reproducible_testing/Rscript/reproduce_2.R In QuantumClone: Clustering Mutations using High Throughput Sequencing (HTS) Data

Try the QuantumClone package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

QuantumClone
Clustering Mutations using High Throughput Sequencing (HTS) Data

tests/reproducible_testing/Rscript/reproduce_2.R
In QuantumClone: Clustering Mutations using High Throughput Sequencing (HTS) Data