R/QuantumClone.R

Defines functions QuantumClone One_step_clustering Tidy_output From_freq_to_cell Patient_schrodinger_cellularities CellularitiesFromFreq strcount Cluster_plot_from_cell Probability.to.belong.to.clone

Documented in CellularitiesFromFreq Cluster_plot_from_cell From_freq_to_cell One_step_clustering Patient_schrodinger_cellularities Probability.to.belong.to.clone QuantumClone strcount Tidy_output

#' One step analysis function
#'
#' Sequentially calls a function to test all accessible cellularities for all mutations in the samples,then cluster them, and finally draws
#' phylogenetic trees based on the uncovered cellularities
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param timepoints a numeric vector indicating if the samples are from different timepoints or tumors (e.g. one tumor and metastates) If NULL,
#' all samples are considered from the same tumor.
#' @param nclone_range A number or range of clusters that should be used for clustering
#' @param clone_priors List of vectors with the putated position of clones
#' @param prior_weight Numeric with the proportion mutations in each clone
#' @param preclustering The type of preclustering used for priors: "Flash","kmedoid" or NULL. NULL will generate
#'  centers using uniform distribution. WARNING: overrides priors given
#' @param Initializations Number of initial conditions to be tested for the EM algorithm
#' @param simulated Should be TRUE if the data has been been generated by the QuantumCat algorithm
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0. 
#' No longer used for clustering.
#' @param save_plot Should the plots be saved? Default is TRUE
#' @param ncores Number of cores to be used during EM algorithm
#' @param output_directory Path to output directory
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights 
#' between two optimization steps. If NULL, will take 1/(average depth).
#' @param optim use L-BFS-G optimization from R, with exact gradient, ("default"), 
#' or L-BFS-G with numerical gradient computation from optimx ("optimx"), 
#' or Differential Evolution ("DEoptim"),
#' or a mixture of EM with exact center computation (if fully diploid), or "optim" if this fails ("compound").
#' Note that DEoptim is the only one that does not use EM algorithm
#' @param keep.all.models Should the function output the best model (default; FALSE), or all models tested (if set to true)
#' @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric. In numeric, the function
#'uses a variant of the BIC by multiplication of the k*ln(n) factor. If >1, it will select models with lower complexity.
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference Cancer phylogeny
#' @import optimx compiler DEoptim
#' @importFrom grDevices colorRampPalette
#' @importFrom graphics par plot segments text
#' @importFrom stats aggregate dbinom frequency na.omit optim rbinom rnbinom rpois runif sd
#' @importFrom utils tail write.table
#' @export
#' @examples
#' Mutations<-QuantumClone::Input_Example
#'  for(i in 1:2){
#'  Mutations[[i]]<-cbind(rep(paste("Example_",i,sep=""),times=10),Mutations[[i]])
#'  colnames(Mutations[[i]])[1]<-"Sample"
#' }
#' print("The data should look like this:")
#' print(head(Mutations[[1]]))
#'
#' cat("Cluster data: will try to cluster between 3 and 4 clones, with 1 maximum search each time,
#'       and will use priors from preclustering.")
#' print("The genotype is provided in the list frame, and
#'           there is no associated data from FREEC to get genotype from.")
#' print("The computation will run on a single CPU.")
#' Clustering_output<-QuantumClone(SNV_list = Mutations,
#' FREEC_list = NULL,contamination = c(0,0),nclone_range = 3:4,
#' clone_priors = NULL,prior_weight = NULL ,
#' Initializations = 1,preclustering = "FLASH", simulated = TRUE,
#' save_plot = TRUE,ncores=1,output_directory="Example")
#' print("The data can be accessed by Clustering_output$filtered_data")
#' print("All plots are now saved in the working directory")
#'

QuantumClone<-function(SNV_list,FREEC_list=NULL,contamination,
                       nclone_range=2:5,clone_priors=NULL,prior_weight=NULL,
                       simulated=FALSE,
                       save_plot=TRUE, epsilon = NULL,
                       Initializations=2,preclustering="FLASH",
                       timepoints=NULL,
                       ncores=1,output_directory=NULL,
                       model.selection = "BIC",optim = "default", keep.all.models = FALSE,
                       force.single.copy = FALSE){
  
  
  r<-One_step_clustering(SNV_list = SNV_list,FREEC_list = FREEC_list,contamination = contamination,nclone_range = nclone_range,
                         clone_priors = clone_priors,prior_weight =prior_weight ,
                         Initializations = Initializations,preclustering = preclustering,
                         simulated = simulated,
                         save_plot = save_plot,ncores=ncores,output_directory=output_directory,
                         model.selection = model.selection,optim = optim, keep.all.models = keep.all.models,
                         force.single.copy = force.single.copy)
  
  #   t<-Tree_generation(Clone_cellularities = r$pamobject$medoids,timepoints = timepoints)
  #
  #   if(save_plot){
  #     pdf(paste(as.character(SNV_list[[1]][1,1]),'trees.pdf',sep='_'))
  #     multiplot_trees(result_list =t,d = dim(r$pamobject$medoids)[1],cex = 0.8)
  #     dev.off()
  #   }
  return(r)
}

#' Cellularity clustering
#'
#' Wrap up function that clusters cellularities. This is based on the most likely possibility for each mutation, give ints frequency and genotype.
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting variant "Alt", as well as the total number of
#' reads overlapping position "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param simulated Should be TRUE if the data has been been generated by the QuantumCat algorithm
#' @param save_plot Should the plots be saved? Default is TRUE
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0. 
#' No longer used for clustering.
#' @param Initializations Number of initial conditions to be tested for the EM algorithm
#' @param preclustering The type of preclustering used for priors: "Flash","kmedoid" or NULL. NULL will generate
#'  centers using uniform distribution. WARNING: overrides priors given
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights 
#' between two optimization steps. If NULL, will take 1/(average depth)
#' @param ncores Number of cores to be used during EM algorithm
#' @param clone_priors List of vectors with the putated position of clones
#' @param prior_weight Numeric with the proportion mutations in each clone
#' @param nclone_range A number or range of clusters that should be used for clustering
#' @param restrict.to.AB Boolean: Should the analysis keep only sites located in A and AB sites in all samples?
#' @param output_directory Directory in which to save results
#' @param optim use L-BFS-G optimization from R ("default"), or from optimx ("optimx"), or Differential Evolution ("DEoptim")
#' @param keep.all.models Should the function output the best model (default; FALSE), or all models tested (if set to true)
#' @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric. 
#' In numeric, the function
#'uses a variant of the BIC by multiplication of the k*ln(n) factor. 
#'If >1, it will select models with lower complexity.
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference
#' @export
#' @examples
#' Mutations<-QuantumClone::Input_Example
#'  for(i in 1:2){
#'  Mutations[[i]]<-cbind(rep(paste("Example_",i,sep=""),times=10),Mutations[[i]])
#'  colnames(Mutations[[i]])[1]<-"Sample"
#' }
#' print("The data should look like this:")
#' print(head(Mutations[[1]]))
#'
#' cat("Cluster data: will try to cluster between 3 and 4 clones, with 1 maximum search each time,
#'       and will use priors from preclustering (e.g. k-medoids on A and AB sites)")
#' print("The genotype is provided in the list frame, and
#'           there is no associated data from FREEC to get genotype from.")
#' print("The computation will run on a single CPU.")
#' Clustering_output<-One_step_clustering(SNV_list = Mutations,
#' FREEC_list = NULL,contamination = c(0,0),nclone_range = 3:4,
#' clone_priors = NULL,prior_weight = NULL ,
#' Initializations = 1,preclustering = "FLASH", simulated = TRUE,
#' save_plot = TRUE,ncores=1,output_directory="Example")
#' print("The data can be accessed by Clustering_output$filtered_data")
#' print("All plots are now saved in the working directory")
#'
#'
One_step_clustering<-function(SNV_list,FREEC_list=NULL,
                              contamination,nclone_range=2:5,
                              clone_priors=NULL,prior_weight=NULL,Initializations=1,preclustering="FLASH",
                              simulated = FALSE, epsilon = NULL,
                              save_plot = TRUE,ncores=1,
                              restrict.to.AB = FALSE,output_directory=NULL,
                              model.selection = "BIC",optim = "default", keep.all.models = FALSE,
                              force.single.copy = FALSE){
  # if(Initializations >1 && preclustering){
  #   message("Only 1 iteration will be run if preclustering is successful")
  # }
  
  ### Checking input arguments
  Sample_name<-as.character(SNV_list[[1]][1,1])
  Sample_names<-lapply(SNV_list,FUN = function(z) z[1,1])
  
  
  ### Checking genotype input
  if(is.null(FREEC_list)){
    message("FREEC_list is empty. Checking that there is a genotype column in all samples...")
    check<-TRUE
    for(i in 1:length(SNV_list)){
      if(is.null(SNV_list[[i]][,"Genotype"])){
        warning(paste("Sample",i,"does not have a Genotype column"))
        check<-FALSE
      }
    }
    if(!check){
      stop("See warnings for more details")
    }
    else{
      message("Genotype is provided.")
      Genotype_provided<-TRUE
    }
  }
  else{
    message("Checking that SNV_list and FREEC_list have the same number of samples...")
    if(length(SNV_list)!=length(FREEC_list)){
      stop("Incorrect input: number of samples different for SNV and FREEC")
    }
    message("Passed")
    Genotype_provided<-FALSE
  }
  
  ### Checking model selection
  if(length(model.selection)>1){
    stop("Model selection can only be one of BIC,AIC or a numeric value.")
  }
  else{
    if(is.character(model.selection)){
      if(model.selection != "BIC" && model.selection != "AIC" ){
        stop("Character value for model selection can only be BIC or AIC")
      }
    }
    else if(!is.numeric(model.selection)){
      stop("Input model.selection is not numeric - and is not BIC or AIC. Please see documentation for model.selection.")
    }
  }
  if(is.null(epsilon)){
    epsilon<-1/mean(unlist(lapply(SNV_list,function(df) df$Depth)))
    message(paste("epsilon set to:",epsilon))
  }
  ### check optim:
  #optims_accepted<-c("default","optimx","DEoptim","RcppDE")
  optims_accepted<-c("default","optimx","DEoptim","compound")
  if(length(optim)!=1){
    stop("optim argument can only be of length 1")
  }
  else if(!(optim %in% optims_accepted)){
    stop(paste("optim can only be one of:", paste(optims_accepted,collapse = ",")))
  }
  
  # Checking length of contamination vector:
  if(is.list(SNV_list) && length(contamination)!=length(SNV_list)){
    if(length(contamination)<length(SNV_list)){
      warning("contamination and SNV_list have different lengths, will repeat contamination")
      contamination<-rep(contamination, times = length(SNV_list))
    }
    else if(length(contamination)<length(SNV_list)){
      warning("contamination and SNV_list have different lengths, will use first arguments of contamination")
      contamination<-contamination[1:length(SNV_list)]
      
    }
  }
  else if(is.data.frame(SNV_list) && length(contamination >1 )){
    warning("length of contamination > 1 but only one sample, will use only first element")
    contamination<-contamination[1]
  }
  
  ### Pre-processing data
  message(paste("Checking all possibilities for",Sample_name))
  
  Cell<-From_freq_to_cell(SNV_list = SNV_list,
                          FREEC_list = FREEC_list,
                          Sample_names = Sample_names,
                          contamination = contamination,
                          Genotype_provided = Genotype_provided,
                          save_plot = save_plot,
                          restrict.to.AB = restrict.to.AB,
                          output_directory=output_directory,
                          force.single.copy = force.single.copy)
  
  message("Starting clustering...")
  r<-Cluster_plot_from_cell(Cell = Cell,nclone_range = nclone_range,
                            epsilon = epsilon,
                            Sample_names = Sample_names,
                            preclustering = preclustering,
                            clone_priors = clone_priors,
                            prior_weight = prior_weight,
                            Initializations = Initializations,
                            simulated = simulated,
                            save_plot=save_plot,
                            contamination = contamination,
                            ncores=ncores,
                            output_directory=output_directory,
                            model.selection = model.selection,
                            optim = optim,
                            keep.all.models = keep.all.models)
  
  
  if(length(SNV_list)==1 & save_plot){
    q<-One_D_plot(EM_out = r,contamination = contamination)
    if(is.null(output_directory)){
      ggplot2::ggsave(plot = q, filename = paste(Sample_name,'/', 'Density', Sample_name,'.pdf',sep=''),width = 6.04,height = 6.04)
    }
    else{
      ggplot2::ggsave(plot = q, filename = paste(output_directory,'/', 'Density', Sample_name,'.pdf',sep=''),width = 6.04,height = 6.04)
    }
  }
  
  #### New version
  
  if(length(unique(Cell[[1]]$id))==length(Cell[[1]]$id)){ # All mutations had a single possible state
    message("Post-processing output")
    if(keep.all.models){
      r<-lapply(r, FUN = function(z){
        Tidy_output(r = z,
                    Genotype_provided = Genotype_provided,
                    SNV_list = SNV_list )
      }
      )
    }
    else{
      r<-Tidy_output(r =r,
                     Genotype_provided = Genotype_provided,
                     SNV_list = SNV_list)
    }
  }
  else{ 
    ### Selecting best state for each variant with best criterion
    ### Then recluster (and keep model if needed)
    message("Keeping most likely state for each variant...")
    ### 1. Select best clusters
    if(keep.all.models){
      Crits<-as.numeric(as.character(unlist(lapply(r, function(z){
        z$Crit
      }))
      )
      )
      r<-r[[which.min(Crits)]]
    }
    r<-Tidy_output(r =r,
                   Genotype_provided = Genotype_provided,
                   SNV_list = SNV_list)
    
    message("Reclustering with most likely state of each variant...")
    r<-Cluster_plot_from_cell(Cell = r$filtered.data,
                              nclone_range = nclone_range,
                              epsilon = epsilon,
                              Sample_names = Sample_names,
                              preclustering = preclustering,
                              clone_priors = clone_priors,
                              prior_weight = prior_weight,
                              Initializations = Initializations,
                              simulated = simulated,
                              save_plot=save_plot,
                              contamination = contamination,
                              ncores=ncores,
                              output_directory=output_directory,
                              model.selection = model.selection,
                              optim = optim,
                              keep.all.models = keep.all.models)
    
  }
  
  if(is.list(r$EM.output$centers)){
    if(!keep.all.models){
      normalized.centers<-list()
      for(i in 1:length(r$EM.output$centers)){
        normalized.centers[[i]]<-r$EM.output$centers[[i]]/(1-contamination[i])
      }
      r$EM.output$normalized.centers<-normalized.centers
    }
    else{
      for(mod in 1:length(r)){
        normalized.centers<-list()
        for(i in 1:length(r$EM.output$centers)){
          normalized.centers[[i]]<-r[[mod]]$EM.output$centers[[i]]/(1-contamination[i])
        }
        r[[mod]]$EM.output$normalized.centers<-normalized.centers
      }
    }
  }
  r
}

#' Tidying output from EM
#' 
#' Tidying input by Chr Start
#' @param r output from Cluster_plot_from_cell
#' @param Genotype_provided If the FREEC_list is provided, then should be FALSE (default), otherwise TRUE
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' 
Tidy_output<-function(r, Genotype_provided,SNV_list){
  Work_input<-SNV_list[[1]]
  Work_output<-r$filtered.data[[1]]
  if(Genotype_provided){
    commonCols<-c("Chr","Start","Depth","Alt","Genotype")
  }else{
    commonCols<-c("Chr","Start","Depth","Alt")
  }
  Keep<-apply(Work_input[,c("Chr","Start","Depth")],MARGIN = 1, function(z){
    t<-apply(Work_output[,c("Chr","Start","Depth")],MARGIN = 1,function(y){
      return(all(y == z))
      
    })
    
    if(sum(t)==0){
      return(FALSE)
    }
    else{
      return(TRUE)
    }
  })
  Cell<-list()
  for(i in 1:length(SNV_list)){
    Cell[[i]]<-SNV_list[[i]][Keep,]
  }
  result<-r
  for(i in 1:length(r$filtered.data)){
    result$filtered.data[[i]]<-merge(r$filtered.data[[i]], Cell[[i]],by = commonCols)
  }
  
  ### Extract new order of mutations:
  Order<-apply(result$filtered.data[[1]][,c("Chr","Start","Depth")],MARGIN = 1, function(z){
    t<-apply(r$filtered.data[[1]][,c("Chr","Start","Depth")],MARGIN = 1,function(y){
      all(z == y)
    })
    
    return(which(t))
  })
  result$cluster<-result$cluster[Order]
  result$EM.output$fik<-result$EM.output$fik[Order,]
  
  return(result)
}

#' Wrap-up function
#'
#' Function that computes the most likely position for each mutation based on the genotype
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param Genotype_provided If the FREEC_list is provided, then should be FALSE (default), otherwise TRUE
#' @param save_plot Should the plots be saved? Default is TRUE
#' @param Sample_names Name of the samples
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0. 
#' No longer used for clustering.
#' @param ncores Number of cores to be used during EM algorithm
#' @param restrict.to.AB Should the analysis keep only sites located in A and AB sites in all samples?
#' @param output_directory Directory in which to save results
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference
#'
From_freq_to_cell<-function(SNV_list,FREEC_list=NULL,Sample_names,Genotype_provided=FALSE,save_plot=TRUE,
                            contamination,ncores = 4, restrict.to.AB = FALSE,output_directory=NULL,
                            force.single.copy = FALSE){
  if(save_plot){
    if(is.null(output_directory)){
      dir.create(path = paste(Sample_names[1]), showWarnings = FALSE)
    }
    else{
      dir.create(path=output_directory,showWarnings = FALSE)
    }
  }
  if(Genotype_provided){
    Schrod_out<-Patient_schrodinger_cellularities(SNV_list = SNV_list,Genotype_provided =TRUE,
                                                  contamination = contamination, restrict.to.AB = restrict.to.AB,
                                                  force.single.copy = force.single.copy)
  }
  else{
    Schrod_out<-Patient_schrodinger_cellularities(SNV_list = SNV_list, FREEC_list = FREEC_list,
                                                  contamination = contamination, restrict.to.AB = restrict.to.AB,
                                                  force.single.copy = force.single.copy)
  }
  
  if(save_plot){
    plot_cell_from_Return_out(Schrod_out,Sample_names,output_directory)
  }
  
  return(Schrod_out)
}

#' Patient Schrodinger Cellularities
#'
#' Computes all possible cellularities for all mutations across all samples. Calls CellularitiesFromFreq on all mutations to evaluate all possibilities
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param Genotype_provided If the FREEC_list is provided, then should be FALSE (default), otherwise TRUE
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0. 
#' No longer used for clustering.
#' @param restrict.to.AB Should the analysis keep only sites located in A and AB sites in all samples?
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference

Patient_schrodinger_cellularities<-function(SNV_list,FREEC_list=NULL,Genotype_provided=FALSE,
                                            contamination, restrict.to.AB = FALSE,
                                            force.single.copy = FALSE){
  result<-list()
  count<-0
  id<-1
  chr<-SNV_list[[1]][,"Chr"]
  chr_ante<-0
  
  ### Compute all possible cellularities and join them
  for(i in 1:nrow(SNV_list[[1]])){ ##Exploring all possibilities for each mutation
    Cell<-list()
    test<-T
    for(k in 1:length(SNV_list)){
      if(test){ ## Do not look at position if it is invalid in another sample previously explored
        if(!is.null(SNV_list[[k]]$subclone.genotype)){
          if(is.na(SNV_list[[k]]$subclone.genotype)){
            subclone.geno<-NULL
          }
        }
        else{
          subclone.geno<-SNV_list[[k]][i,"subclone.genotype"]
        }
        if(Genotype_provided){
          Cell[[k]]<-cbind(CellularitiesFromFreq(Genotype= as.character(SNV_list[[k]][i,'Genotype']),
                                                 Alt = SNV_list[[k]][i,"Alt"],
                                                 Depth = SNV_list[[k]][i,"Depth"],
                                                 subclone.genotype = subclone.geno,
                                                 subclone.cell = SNV_list[[k]][i,"subclone.cell"],
                                                 chr = SNV_list[[k]][i,'Chr'],
                                                 position = SNV_list[[k]][i,'Start'],
                                                 contamination = contamination[k],
                                                 restrict.to.AB = restrict.to.AB,
                                                 force.single.copy = force.single.copy),
                           id)
        }
        else{
          if(chr[i]!=chr_ante){
            CHR_FREEC<-lapply(FREEC_list,
                              function(z){
                                z[as.character(z[,"Chromosome"])==strsplit(as.character(chr[i]),
                                                                           split = "r")[[1]][2],]
                              } )
            chr_ante<-chr[i]
          }
          Cell[[k]]<-cbind(CellularitiesFromFreq(Freec_ratio = CHR_FREEC[[k]],
                                                 Alt = SNV_list[[k]][i,"Alt"],Depth = SNV_list[[k]][i,"Depth"],
                                                 subclone.genotype = subclone.geno,
                                                 subclone.cell = SNV_list[[k]][i,"subclone.cell"],
                                                 chr = SNV_list[[k]][i,'Chr'], position = SNV_list[[k]][i,'Start'],
                                                 contamination = contamination[k],
                                                 restrict.to.AB = restrict.to.AB,
                                                 force.single.copy = force.single.copy),
                           id)
        }
        if(sum(is.na(Cell[[k]]))>0){
          test<-F
          count<-count+1
        }
      }
    }
    if(test){ ## Checking that genotype is available
      L<-list()
      for(r in 1:length(Cell)){
        L[[r]]<-1:(nrow(Cell[[r]]))
      }
      U<-expand.grid(L) ##Table with all Cell row combinations
      for(k in 1:(ncol(U))){
        if(id==1){
          result[[k]]<-Cell[[k]][U[,k],]
        }
        else{
          result[[k]]<-rbind(result[[k]],Cell[[k]][U[,k],])
        }
      }
      id<-id+1
    }
  }
  
  if(count>0){
    warning(paste(count,'mutations excluded due to missing genotype or normalization issues'))
  }
  return(result)
}

#' Cellularities from allele frequency
#'
#' Creates all possibilities for one mutation in one sample (given a genotype), then computes
#' the cellularity associated with each possibility and finally the probability of each possibility
#' @param chr The chromosome on which is the position (numeric value, not chr1 as in BED files)
#' @param position The genomic position of the mutation
#' @param Freec_ratio The FREEC output associated with the sample of interest
#' @param Genotype If the FREEC output is not given, the genotype associated with the locus (for example AAB)
#' @param Alt Number of reads supporting the variation
#' @param Depth Number of reads mapped at the position
#' @param subclone.genotype If existing, the genotype of the subclone. Else NULL
#' @param subclone.cell The cellular prevalence of the subclone which has a different Copy Number at this site
#' @param contamination The fraction of normal cells in the sample
#' @param restrict.to.AB Should the analysis keep only sites located in A and AB sites in all samples?
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference

CellularitiesFromFreq<-function(chr, position,Alt,Depth,
                                Freec_ratio=NULL, Genotype=NULL,subclone.genotype=NULL,
                                subclone.cell=NULL,contamination, restrict.to.AB = FALSE,
                                force.single.copy = FALSE){
  ##For 1 mutation
  if(!is.null(Freec_ratio)){
    if(grepl(pattern = "chr",x = chr,ignore.case = T)){
      FChr<-sapply(X = 'chr',FUN = paste, Freec_ratio[,'Chromosome'],sep='')
    }
    else{
      FChr<-Freec_ratio[,'Chromosome']
    }
    Genotype<-as.character(tail(Freec_ratio[FChr==chr & Freec_ratio[,'Start']<position,'Genotype'],1))
  }
  if(length(Genotype)==0){
    message(paste("Genotype is not available at position",chr,position))
    return(NA)
  }
  else if(is.na(Genotype)){
    message(paste("Genotype is not available at position",chr,position))
    return(NA)
  }
  else if(Genotype==-1){
    message(paste("Genotype is not available at position",chr,position))
    return(NA)
  }
  else if(restrict.to.AB & (Genotype!='A' & Genotype!='AB')){
    message(paste("Position",chr,position,"is not in an A or AB site. Genotype:",Genotype))
    return(NA)
  }
  else if (is.null(subclone.genotype) | is.null(subclone.cell)){
    As<-strcount(x = Genotype, pattern = 'A',split = '')
    Ns<-nchar(Genotype)
    if(force.single.copy){
      ### Only one possibility per mutation
      result<-data.frame(Chr = chr,
                         Start =  position, 
                         Cellularity = as.numeric(
                             {Alt/Depth}*{Ns + contamination/(1-contamination)*2}
                             ),
                         Genotype = Genotype,
                         Alt = Alt,
                         Depth = Depth,
                         NC = 1,
                         NCh = Ns)
      
    }
    else{
      
      ### Vectorized version:
      result<-data.frame(Chr = chr,
                         Start = position,
                         Cellularity = as.numeric(
                             {Alt/Depth}*{Ns + contamination/(1-contamination)*2}/(1:As)
                         ),
                         Genotype = Genotype,
                         Alt = Alt,
                         Depth = Depth,
                         NC = 1:As,
                         NCh = Ns)
    }
  }
  else{ ## Two possibilities: belong to clone or subclone
    result<-data.frame()
    As<-strcount(x = Genotype, pattern = 'A',split = '')
    Ns<-nchar(Genotype)
    for(i in 1:As){
      Cellularity<-as.numeric(frequency/100*{Ns+contamination/(1-contamination)*2}/i)
      spare<-data.frame(chr,position,Cellularity, Genotype,Alt,Depth,i,Ns)
      colnames(spare)<-c('Chr','Start','Cellularity','Genotype',"Alt","Depth","NC","NCh")
      result<-rbind(result,spare)
    }
    A.sub<-strcount(x=subclone.genotype,pattern = "A",split = '')
    N.sub<-nchar(subclone.genotype)
    for(j in A.sub){
      ### Keep only possibilities that have a cellularity lower than the subclonal cellularity
      
      Cellularity<-as.numeric(frequency/100*{N.sub+contamination/(1-contamination)*2}/j)
      if(Cellularity<=subclone.cell){
        spare<-data.frame(chr,position,Cellularity, subclone.genotype,Alt,Depth,j,N.sub)
        colnames(spare)<-c('Chr','Start','Cellularity','Genotype',"Alt","Depth","NC","NCh")
        result<-rbind(result,spare)
      }
    }
  }
  
  result<-data.frame(result)
  colnames(result)<-c('Chr','Start','Cellularity','Genotype',"Alt","Depth","NC","NCh")
  return(result)
}

#'String count
#'
#' Counting the number of characters for each element of a vector
#'
#' @param x The vector from which elements should be counted
#' @param pattern Pattern to be recognized. Default is ''
#' @param split Pattern used to split elements of the vector. Default is ''
#' @keywords Text handling

strcount <- function(x, pattern='', split=''){
  
  unlist(lapply(strsplit(x, split),function(z) na.omit(length(grep(pattern, z)))))
}
#' Cellularity clustering
#'
#' Clustering cellularities based on  the most likely presence of a clone, using the pamk algorithm (fpc package). Clustering can be guided by toggling manual_clustering on and/or giving a range of number of clusters.
#' @param Cell Output from Return_one_cell_by_mut, list of cellularities (one list-element per sample)
#' @param Sample_names  Name of the sample
#' @param simulated Was the data generated by QuantumCat?
#' @param save_plot Should the clustering plots be saved? Default is True
#' @param contamination The fraction of normal cells in the samples
#' @param clone_priors If known a list of priors (cell prevalence) to be used in the clustering
#' @param prior_weight If known a list of priors (fraction of mutations in a clone) to be used in the clustering
#' @param nclone_range Number of clusters to look for
#' @param Initializations Maximal number of independant initial condition tests to be tried
#' @param preclustering The type of preclustering used for priors: "Flash","kmedoid" or NULL. NULL will generate
#'  centers using uniform distribution.
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights between two optimization steps.
#' @param ncores Number of CPUs to be used
#' @param output_directory Directory in which to save results
#' @param optim use L-BFS-G optimization from R ("default"), or from optimx ("optimx"), or Differential Evolution ("DEoptim")
#' @param keep.all.models Should the function output the best model (default; FALSE), or all models tested (if set to true)
#' @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric. In numeric, the function
#'uses a variant of the BIC by multiplication of the k*ln(n) factor. If >1, it will select models with lower complexity.
#' @importFrom fpc pamk
#' @importFrom ggplot2 ggsave
#' @keywords Clonal inference
#'
Cluster_plot_from_cell<-function(Cell,Sample_names,simulated,save_plot=TRUE,
                                 contamination, clone_priors,prior_weight,nclone_range,Initializations,preclustering=TRUE,
                                 epsilon=5*(10**(-3)),ncores = 2,output_directory=NULL,
                                 model.selection = "BIC",optim = "default", keep.all.models = FALSE){
  preclustering_success<-FALSE
  preclustering_FLASH<-FALSE
  ##### PRECLUSTERING
  ###################
  if(!is.null(preclustering)){
    if(preclustering=="kmedoid"){
      for(i in 1:length(Cell)){
        if(i==1){
          select<-Cell[[1]][,"Genotype"]=='A'| Cell[[1]][,"Genotype"]=='AB'
          Spare<-Cell[[1]][,'Cellularity']
        }
        else{
          select<- select & (Cell[[i]][,"Genotype"]=='A'| Cell[[i]][,"Genotype"]=='AB')
          Spare<-cbind(Spare,Cell[[i]][,'Cellularity'])
        }
      }
      if(length(Cell)==1){
        Spare<-Spare[select]
        if(length(Spare)==0){
          warning("No A and AB sites to do preclustering")
          p_clone<-clone_priors
          p_weight<-prior_weight
        }
      }
      else{
        Spare<-Spare[select,] ##restriction to A AB sites
      }
      if(is.null(dim(Spare))){
        warning("No A and AB sites to do preclustering")
        p_clone<-clone_priors
        p_weight<-prior_weight
      }
      else{
        if(nrow(Spare)<=max(nclone_range)){
          warning("Too few mutations to cluster. Will use priors / random initial conditions")
          p_clone<-clone_priors
          p_weight<-prior_weight
        }
        else{
          Spare[Spare>1]<-1
          kmeans<-fpc::pamk(Spare,krange = nclone_range,usepam = F)
          p_clone<-list()
          p_weight<-numeric()
          create_prior_weight<- nrow(Spare)>=50
          for(j in 1:(ncol(kmeans$pamobject$medoids))){
            p_clone[[j]]<-as.numeric(kmeans$pamobject$medoids[,j])
            if(create_prior_weight){
              p_weight[j]<-sum(kmeans$pamobject$clustering==j)/length(kmeans$pamobject$clustering)
            }
          }
          if(!create_prior_weight){
            p_weight<-prior_weight
          }
          preclustering_success<-T
        }
      }
    }
    else{ ### "FLASH" preclustering
      preclustering_FLASH<-TRUE
      result<-EM_clustering(Schrod = Cell,contamination = contamination,
                            epsilon = epsilon,
                            Initializations = Initializations,
                            nclone_range = nclone_range,
                            ncores = ncores,
                            model.selection = model.selection,
                            optim = optim, 
                            keep.all.models = keep.all.models,
                            FLASH = TRUE)
    }
  }
  else{
    p_clone<-clone_priors
    p_weight<-prior_weight
  }
  #################
  ### Clustering with EM
  #################
  if(!preclustering_FLASH){
    if(preclustering_success){
      result<-EM_clustering(Schrod = Cell,contamination = contamination,epsilon = epsilon,
                            prior_weight = p_weight,clone_priors = p_clone,Initializations = Initializations,
                            nclone_range = nclone_range,ncores = ncores,
                            model.selection = model.selection,optim = optim, 
                            keep.all.models = keep.all.models)
    }
    else{
      result<-EM_clustering(Schrod = Cell,contamination = contamination,epsilon = epsilon,
                            prior_weight = p_weight,clone_priors = p_clone,Initializations = Initializations,
                            nclone_range = nclone_range,ncores = ncores,
                            model.selection = model.selection,optim = optim,keep.all.models = keep.all.models)
    }
  }
  
  #### Plots possibilities
  
  if(save_plot && length(Cell)>1){
    plot_cell_from_Return_out(result$filtered.data,
                              Sample_names = Sample_names,
                              output_dir=output_directory)
  }
  return(result)
}

#' Probability
#'
#' Returns dataframe with all informations about mutation (Number of copies, Cellularity, etc.) and probability to belong to a clone
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param clone_prevalence List of numeric vectors giving the cellular prevalence of each clone in each sample, not normalized for contamination. 
#' This should be stored in `QC_output$EM.output$centers`
#' @param contamination Numeric vector giving the contamination by normal cells
#' @param clone_weights Numeric vector giving the proportion of mutations in each clone
#' @keywords Clonal inference phylogeny
#' @export
#' @examples
#' set.seed(123)
#' SNVs<-QuantumCat(number_of_clones = 2,number_of_mutations = 50,number_of_samples = 1,ploidy = "AB")
#' Probability.to.belong.to.clone(SNV_list=SNVs,
#' clone_prevalence=list(c(0.5,1),c(0.5,1)),contamination=c(0,0))

Probability.to.belong.to.clone<-function(SNV_list,
                                         clone_prevalence,
                                         contamination,
                                         clone_weights=NULL){
  if(is.null(clone_weights)){
    clone_weights<-rep(1/(length(clone_prevalence[[1]])),times = length(clone_prevalence[[1]]))
  }
  if(is.null(SNV_list[[1]]$NC)){ ### The output has not been through clustering
    Schrod<-Patient_schrodinger_cellularities(SNV_list = SNV_list,Genotype_provided = TRUE,
                                              contamination = contamination)
    
    result<- eval.fik(Schrod = Schrod,
                      centers = clone_prevalence,
                      weights= clone_weights,keep.all.poss = TRUE,
                      adj.factor = Compute.adj.fact(Schrod = Schrod)
    )
    filtered<-filter_on_fik(Schrod = Schrod,fik = result)
    filtered_prob<-Probability.to.belong.to.clone(SNV_list = filtered,clone_prevalence = clone_prevalence,
                                                  contamination = contamination,clone_weights = clone_weights)$filtered_prob
    
  }
  else{### The output has  been through clustering
    Schrod<-SNV_list
    adj.fact<-Compute.adj.fact(SNV_list)
    result<-eval.fik(Schrod = SNV_list,
                     centers = clone_prevalence,
                     weights =clone_weights,
                     keep.all.poss = TRUE,
                     adj.factor = adj.fact
    )
    filtered_prob<-result
    filtered <-SNV_list
  }
  clustering<-apply(X = filtered_prob,MARGIN = 1,FUN = function(z) {
    if(sum(z==max(z))>1){ ### Look for the multiple clones, and attribute with probability proportional to the weight
      if(max(z)>0){
        pos<-which(z==max(z))
        prob<-clone_weights[pos]/(sum(clone_weights[pos]))
        #sample(x = pos, size = 1, prob = prob))
        return(pos[which.max(prob)])
      }
      else{ ### all possibilities have 0 probability, so choose one randomly
        return(sample(1:length(z),size = z))
      }
    }
    else{ # only one clone has maximal probability
      return(which.max(z))
    }
  })
  
  return(list(unfiltered_prob = result,
              unfiltered_data = Schrod,
              filtered_prob =filtered_prob,
              filtered_data = filtered,
              cluster = clustering)
  )
}

Try the QuantumClone package in your browser

Any scripts or data that you put into this service are public.

QuantumClone documentation built on May 2, 2019, 3:03 a.m.