R/FlashQC.R

Defines functions FLASH_main BIC_criterion_FLASH Compute_objective FlashQC MajorityVote zscore ProbDistMatrix Create_prior_cutTree Cellular_preclustering

Documented in Cellular_preclustering Create_prior_cutTree FlashQC ProbDistMatrix zscore

#'  Preclustering method
#' 
#' This method clusters mutations based on the probability that they are from the same 
#' distribution.
#' It first computes the zscore associated with a normalized number of alternative reads and depth.
#' The "normalized" number of reads is the number of alternative reads expected if the mutation was at a single 
#' copy in a diploid genome.
#' 
#' @param Schrod_cells The classic output from Schrodinger function
#' @return returns a list with:
#' \describe{
#'  \item{similarityMatrix}{ The matrix of probabilities}
#'  \item{distance}{The dissimilarity matrix}
#'  \item{tree}{The tree obtained by hierachical clustering of the dissimilarity matrix using "ward.D2" method}
#'  }
#'  
#' @importFrom stats hclust as.dist cutree hclust
Cellular_preclustering<-function(Schrod_cells){
  for(i in 1:length(Schrod_cells)){
    Schrod_cells[[i]]$Norm_Alt<-round(
      Schrod_cells[[i]]$Alt * Schrod_cells[[i]]$NCh / (2 * Schrod_cells[[i]]$NC)
    )
  }
  
  
  DistMat<-ProbDistMatrix(Schrod_cells)
  dissimMatrix<-as.dist(1-DistMat)
  tree<-hclust(d = dissimMatrix,method = "ward.D2")
  #cut_tree<-cutree(tree = tree,k = majority)
  result<-list(similarityMatrix = DistMat,
               distance = dissimMatrix,
               tree = tree
  )
  result 
}


#' Create priors from hierarchical clustering
#' 
#' Creates weights and position priors from the hierachical clustering (tree) given
#' a number of clusters Nclust. The centers of each cluster is found by
#' \eqn{Center = \frac{\sum\limits_{m \in cluster}{Normalized Alt_{m}}}{\sum\limits_{m \in cluster}{Normalized Alt_{m}}}}
#' @param tree The tree generated by Cellular_preclustering
#' @param Schrod_cells The classic output from Schrodinger function
#' @param NClus the number of clusters to cut the data
#' @param jitter Should it jitter weights and centers around values found?
#' @return returns a list with:
#' \describe{
#'  \item{weigths}{The proportion of mutations in each cluster}
#'  \item{centers}{A list with a numeric vector for each sample, with the cellularity of each cluster}
#'  }
#'  
#'  @importFrom stats cutree
Create_prior_cutTree<-function(tree,Schrod_cells,NClus,jitter = FALSE){
  for(i in 1:length(Schrod_cells)){
    Schrod_cells[[i]]$Norm_Alt<-round(
      Schrod_cells[[i]]$Alt * Schrod_cells[[i]]$NCh / (2 * Schrod_cells[[i]]$NC)
    )
  }
  clustering<-cutree(tree,k = NClus)
  
  weights<-table(clustering)/length(clustering)
  if(jitter){
    weights<-jitter(weights)
    weights[weights<0]<-0
    weights<-weights/sum(weights)
  }
  centers<-list()
  for(i in 1:length(Schrod_cells)){
    if(jitter){
    centers[[i]]<-jitter(2 * unlist(sapply(as.numeric(names(weights)),
                                    function(cl){
                                      sum(Schrod_cells[[i]]$Norm_Alt[clustering==cl])/
                                        sum(Schrod_cells[[i]]$Depth[clustering==cl])
                                    }
    )
    )
    )
    }
    else{
      centers[[i]]<-2 * unlist(sapply(as.numeric(names(weights)),
                                             function(cl){
                                               sum(Schrod_cells[[i]]$Norm_Alt[clustering==cl])/
                                                 sum(Schrod_cells[[i]]$Depth[clustering==cl])
                                             }
      )
      )
    }
    centers[[i]][centers[[i]]>1]<-1
    centers[[i]][centers[[i]]<=0]<-.Machine$double.eps
    
  }
  list(weights = weights,centers = centers)
}

#'  Distance
#' 
#' Creates a matrix of distance between two points based on the p-value to be from the same distribution
#' @param Schrod_cells The classic output from Schrodinger function, with the normalized Alt
#' @return returns a square numeric matrix
#' @importFrom stats pnorm
ProbDistMatrix<-function(Schrod_cells){
  result<-matrix(0,nrow = nrow(Schrod_cells[[1]]),ncol = nrow(Schrod_cells[[1]]))
  for(l in 1:length(Schrod_cells)){
    ### z-score should be >0
    result<-result+zscore(Depth = Schrod_cells[[l]]$Depth,
                          Alt = Schrod_cells[[l]]$Norm_Alt
    )
  }
  2*pnorm(-result**(1/2))
  
}

#'  Z-score
#' 
#' Computes the z-score of mutations being from the same distribution 
#' @param Depth a numeric vector of depth of sequencing for each variant
#' @param Alt a numeric vector of the number of reads supporting each variant
#' @return square numeric matrix
zscore<-function(Depth,Alt){
  n<-length(Depth)
  result<-matrix(ncol = n,nrow = n)
  for(i in 1:n){
    p<-(Alt+Alt[i])/(Depth + Depth[i])
    w0<-which(p==0)
    w1<-which(p>1)
    p1<-Alt[i]/Depth[i]
    p2<-Alt/Depth
    result[,i]<-((p1-p2)**2)/(p*(1-p)*(1/Depth[i]+1/Depth))
    result[w0,i]<-0
    result[w1,i]<-+Inf
    result[is.na(result[,i]),i]<-+Inf
  }
  result
}


# Majority vote
# 
# Extract majority vote from multiple indices
# @param index vector with number of clusters selected by indices
# @return Numeric value of the number of clusters to chose
MajorityVote<-function(index){
  tab<-table(index)
  test<-tab==max(tab)
  if(sum(test)>1){ # return weighted center
    return(
      round(sum(tab[test]*as.numeric(names(tab[test])))/sum(tab[test]))
      
    )
    
  }
  else{
    return(names(which.max(tab)))
  }
}

#' Flash QuantumClone
#' 
#' Fast method to find clones without filtering for multiple states
#' @param Cells Input for QuantumClone with genotype required
#' @param conta vector with contamination fraction in each sample
#' @param Nclus vector with the number of clusters to test (alternatively only min and max values)
#' @param model.selection One of "tree", "AIC", "BIC" or numeric. "tree" will use "ccc","ch" and "gap" methods from NbClust 
#' to determine the number of clusters. "BIC","AIC" or numeric values will use methods from QuantumClone.
#' @examples
#' set.seed(123)
#' #1: Cluster data
#' In<-QuantumClone::Input_Example
#' FQC<-FlashQC(In,conta = c(0,0),Nclus = 2:10)
#' 
#' #2: Get order variants by clones:
#' ord<-order(In[[1]]$Chr)
#' #3: Visualize clustering:
#' image(
#'  1:nrow(In[[1]]),
#'  1:nrow(In[[1]]),
#'  FQC$similarity[ord,ord], 
#'  xlab="", ylab="")
#' #4: add limit of real clusters:
#' abline(h = cumsum(table(In[[1]]$Chr[ord]))+1)
#' abline(v = cumsum(table(In[[1]]$Chr[ord]))+1)
#' 
#' #5: alternatively add clusters found:
#' ord<-order(FQC$cluster)
#' image(
#'  1:nrow(In[[1]]),
#'  1:nrow(In[[1]]),
#'  FQC$similarity[ord,ord], 
#'  xlab="", ylab="")
#' abline(h = cumsum(table(FQC$cluster[ord]))+1)
#' abline(v = cumsum(table(FQC$cluster[ord]))+1)
#' # Show clustering quality:
#' NMI_cutree( FQC$cluster,chr = In[[1]]$Chr)
#' @export
#' @return returns a list with:
#' \describe{
#'  \item{similarityMatrix}{ The matrix of probabilities}
#'  \item{distance}{The dissimilarity matrix}
#'  \item{tree}{The tree obtained by hierachical clustering of the dissimilarity matrix using "ward.D2" method}
#'  \item{cluster}{attributed cluster for each variant}
#'  }
#' @importFrom NbClust NbClust
#' @seealso QuantumClone
FlashQC<-function(Cells,conta,Nclus,model.selection = "tree"){
  if(is.data.frame(Cells)){
    Cells<-list(Cells)
  }
  else if(!is.list(Cells)){
    stop("Incorrect input: Cells... exiting")
  }
  
  message("Processing input data...")
  Schrod_cells<-Patient_schrodinger_cellularities(SNV_list = Cells,
                                                  contamination = conta,
                                                  Genotype_provided = TRUE)
  
  if(nrow(Schrod_cells[[1]])==nrow(Cells[[1]])){
    recluster<-FALSE
  }
  else{
    recluster<-TRUE
  }
  
  for(i in 1:length(Schrod_cells)){
    
    Schrod_cells[[i]]$Norm_Alt<-round(
      Schrod_cells[[i]]$Alt * Schrod_cells[[i]]$NCh / (2 * Schrod_cells[[i]]$NC)
    )
  }
  
  message("Clustering...")
  DistMat<-ProbDistMatrix(Schrod_cells)
  dissimMatrix<-as.dist(1-DistMat)
  tree<-hclust(d = dissimMatrix,method = "ward.D2")
  ##################################
  ### Find correct number of clusters
  ##################################
  majority<-FLASH_main(Schrod_cells = Schrod_cells,
                       model.selection = model.selection,
                       conta = conta,Nclus = Nclus,tree = tree,
                       dissimMatrix = dissimMatrix)
  cut_tree<-cutree(tree = tree,k = majority)
  priors<-Create_prior_cutTree(tree = tree,
                               Schrod_cells = Schrod_cells,
                               NClus = majority
  )
  if(recluster){
    message("Keeping most likely positions and reclustering...")
    adj.factor<-Compute.adj.fact(Schrod_cells)
    fik<-eval.fik(Schrod = Schrod_cells,
                  centers = priors$centers,
                  weights = priors$weights,
                  adj.factor = adj.factor,
                  keep.all.poss = TRUE)
    filtered.data<-filter_on_fik(Schrod_cells,fik = fik)
    
    DistMat<-ProbDistMatrix(filtered.data)
  
    dissimMatrix<-as.dist(1-DistMat)
    tree<-hclust(d = dissimMatrix,method = "ward.D2")
    majority<-FLASH_main(Schrod_cells = filtered.data,
                         model.selection = model.selection,
                         conta = conta,Nclus = Nclus,tree = tree,
                         dissimMatrix = dissimMatrix)
    
    cut_tree<-cutree(tree = tree,k = majority)
    priors<-Create_prior_cutTree(tree = tree,
                                 Schrod_cells = filtered.data,
                                 NClus = majority
    )
    result<-list(similarity = DistMat,
                 distance = dissimMatrix,
                 tree = tree,
                 cluster = cut_tree,
                 majority = majority,
                 weights = priors$weights,
                 centers = priors$centers,
                 filtered.data = filtered.data
    )
  }
  else{
    result<-list(similarity = DistMat,
                 distance = dissimMatrix,
                 tree = tree,
                 cluster = cut_tree,
                 majority = majority,
                 weights = priors$weights,
                 centers = priors$centers,
                 filtered.data = Schrod_cells
    )
  }
  
  

  result 
}

# Compute value of objective function
# 
# Compute the value of clustering based on same principles as QuantumClone EM
# @param tree Tree from hierarchical clustering
# @param nclus Number of clusters used for cutting (numeric of length 1)
# @param Schrod Output from Schrodinger cellularities
# @param conta Numeric value of contamination fraction in each sample
Compute_objective<-function(tree,nclus,Schrod,conta){
  priors<-Create_prior_cutTree(tree = tree,Schrod_cells = Schrod,NClus = nclus)

  adj.factor<-Compute.adj.fact(Schrod)
  fik<-eval.fik(Schrod = Schrod,
           centers = priors$centers,
           weights = priors$weights,
           adj.factor = adj.factor,
           keep.all.poss = TRUE
           )
  w<-which(fik == 0)
  if(length(w)){
    fik<--fik *log(fik )
    fik[w]<-0
    return(sum(fik))
  }
  else{
    return(-sum(fik *log(fik )))
  }
  
}

# Compute criterion FLASH
# 
# Computes BIC from a list of outputs of EM algorithm, then returns the position with minimal BIC
# @param Obj Numeric vector with objective function values
# @param Mut_num Number of mutations to cluster
# @param k the number of clusters (in the same order as Obj)
# @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric. In numeric, the function
# @param s Number of samples
# @keywords EM clustering number
BIC_criterion_FLASH<-function(Obj,Mut_num,k ,model.selection,s){
  ### Criterion should be minimized
  # Here we assimilate EM.output$val to -ln(L) where L is the likelihood of the model
  # BIC is written -2*ln(L)+k*ln(k)
  # Generalized BIC is written -2*ln(L)+q * k*ln(k)
  # AIC is written 2*k - 2*ln(L)
  
  if(is.numeric(model.selection)){
    ### Modified BIC to relax or add constraints on model selection
    # if q > 1 adding explicative variables should explain observed values better => control overfitting
    # if q = 1 BIC
    # if q < 1 adding explicative variables is less costly
    
    
    Bic<-numeric()
    
    #k<-length(EM_out_list[[i]]$EM.output$centers[[1]])

    Bic<-2*Obj+model.selection * s * k *log(Mut_num)

    return(Bic)
  }
  else if(model.selection == "BIC"){
    Bic<-2*Obj +s* k *log(Mut_num)
    
    return(Bic)
  }
  else if(model.selection == "AIC"){
    Aic<-2*Obj+2*k*s
    
    return(Aic)
  }
}

# Flash core
# 
# Returns number of clusters based on model selection
# @param Schrod_cells Output from Schrodinger cellularities
# @param conta vector with contamination fraction in each sample
# @param Nclus vector with the number of clusters to test (alternatively only min and max values)
# @param model.selection One of "tree", "AIC", "BIC" or numeric. "tree" will use "ccc","ch" and "gap" methods from NbClust 
# to determine the number of clusters. "BIC","AIC" or numeric values will use methods from QuantumClone.
# @param tree Hierarchical tree from hclust
# @param dissimMatrix Dissimilarity matrix, required if model selection is "tree"
FLASH_main<-function(Schrod_cells,model.selection,conta,Nclus,tree = NULL,dissimMatrix=NULL){
  if(grepl(x = "tree",pattern =  model.selection)){
    ### Using NbClust
    Cells<-matrix(data = 0,ncol = length(Schrod_cells),nrow = nrow(Schrod_cells[[1]]))
    for(i in 1:length(Schrod_cells)){
      Cells[,i]<-Schrod_cells[[i]]$Alt/Schrod_cells[[i]]$Depth*
        Schrod_cells[[i]]$NCh/(Schrod_cells[[i]]$NC * 1-conta[i])
      
    }
    index<- c("ch","ccc","gap")
    method<-"ward.D2"
    if(ncol(Cells)==1){
      Cells<-cbind(Cells,jitter(Cells))
    }
    selected<-unlist(sapply(X =index,function(name){
      NbClust::NbClust(data = Cells,diss = dissimMatrix, distance =  NULL,
              method = method,index = name,
              min.nc = min(Nclus),
              max.nc = max(Nclus)
      )$Best.nc["Number_clusters"]
    })
    )
    majority<-MajorityVote(selected)
  }else{
    ### Compute objective for each cluster value
    obj<-sapply(Nclus,function(nclus){
      Compute_objective(tree = tree,
                        nclus = nclus,
                        Schrod = Schrod_cells,
                        conta = conta)
    }
    )
    ### Adapt to BIC selection
    if(is.list(Schrod_cells)){
      s<-length(Schrod_cells)
    }
    else{
      s<-1
    }
    Crit<-BIC_criterion_FLASH(Obj = obj,Mut_num = nrow(Schrod_cells[[1]]),
                              k = Nclus,
                              model.selection = model.selection,
                              s = s
    )
    majority<-Nclus[which.min(Crit)]
  }
  #tree<-hclust(d = dissimMatrix,method = method)
  
  majority
}
DeveauP/QuantumClone documentation built on Oct. 29, 2021, 8:56 a.m.