Nothing
#' One step analysis function
#'
#' Sequentially calls a function to test all accessible cellularities for all mutations in the samples,then cluster them, and finally draws
#' phylogenetic trees based on the uncovered cellularities
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param timepoints a numeric vector indicating if the samples are from different timepoints or tumors (e.g. one tumor and metastates) If NULL,
#' all samples are considered from the same tumor.
#' @param nclone_range A number or range of clusters that should be used for clustering
#' @param clone_priors List of vectors with the putated position of clones
#' @param prior_weight Numeric with the proportion mutations in each clone
#' @param preclustering The type of preclustering used for priors: "Flash","kmedoid" or NULL. NULL will generate
#' centers using uniform distribution. WARNING: overrides priors given
#' @param Initializations Number of initial conditions to be tested for the EM algorithm
#' @param simulated Should be TRUE if the data has been been generated by the QuantumCat algorithm
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0.
#' No longer used for clustering.
#' @param save_plot Should the plots be saved? Default is TRUE
#' @param ncores Number of cores to be used during EM algorithm
#' @param output_directory Path to output directory
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights
#' between two optimization steps. If NULL, will take 1/(average depth).
#' @param optim use L-BFS-G optimization from R, with exact gradient, ("default"),
#' or L-BFS-G with numerical gradient computation from optimx ("optimx"),
#' or Differential Evolution ("DEoptim"),
#' or a mixture of EM with exact center computation (if fully diploid), or "optim" if this fails ("compound").
#' Note that DEoptim is the only one that does not use EM algorithm
#' @param keep.all.models Should the function output the best model (default; FALSE), or all models tested (if set to true)
#' @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric. In numeric, the function
#'uses a variant of the BIC by multiplication of the k*ln(n) factor. If >1, it will select models with lower complexity.
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference Cancer phylogeny
#' @import optimx compiler DEoptim
#' @importFrom grDevices colorRampPalette
#' @importFrom graphics par plot segments text
#' @importFrom stats aggregate dbinom frequency na.omit optim rbinom rnbinom rpois runif sd
#' @importFrom utils tail write.table
#' @export
#' @examples
#' Mutations<-QuantumClone::Input_Example
#' for(i in 1:2){
#' Mutations[[i]]<-cbind(rep(paste("Example_",i,sep=""),times=10),Mutations[[i]])
#' colnames(Mutations[[i]])[1]<-"Sample"
#' }
#' print("The data should look like this:")
#' print(head(Mutations[[1]]))
#'
#' cat("Cluster data: will try to cluster between 3 and 4 clones, with 1 maximum search each time,
#' and will use priors from preclustering.")
#' print("The genotype is provided in the list frame, and
#' there is no associated data from FREEC to get genotype from.")
#' print("The computation will run on a single CPU.")
#' Clustering_output<-QuantumClone(SNV_list = Mutations,
#' FREEC_list = NULL,contamination = c(0,0),nclone_range = 3:4,
#' clone_priors = NULL,prior_weight = NULL ,
#' Initializations = 1,preclustering = "FLASH", simulated = TRUE,
#' save_plot = TRUE,ncores=1,output_directory="Example")
#' print("The data can be accessed by Clustering_output$filtered_data")
#' print("All plots are now saved in the working directory")
#'
QuantumClone<-function(SNV_list,FREEC_list=NULL,contamination,
nclone_range=2:5,clone_priors=NULL,prior_weight=NULL,
simulated=FALSE,
save_plot=TRUE, epsilon = NULL,
Initializations=2,preclustering="FLASH",
timepoints=NULL,
ncores=1,output_directory=NULL,
model.selection = "BIC",optim = "default", keep.all.models = FALSE,
force.single.copy = FALSE){
r<-One_step_clustering(SNV_list = SNV_list,FREEC_list = FREEC_list,contamination = contamination,nclone_range = nclone_range,
clone_priors = clone_priors,prior_weight =prior_weight ,
Initializations = Initializations,preclustering = preclustering,
simulated = simulated,
save_plot = save_plot,ncores=ncores,output_directory=output_directory,
model.selection = model.selection,optim = optim, keep.all.models = keep.all.models,
force.single.copy = force.single.copy)
# t<-Tree_generation(Clone_cellularities = r$pamobject$medoids,timepoints = timepoints)
#
# if(save_plot){
# pdf(paste(as.character(SNV_list[[1]][1,1]),'trees.pdf',sep='_'))
# multiplot_trees(result_list =t,d = dim(r$pamobject$medoids)[1],cex = 0.8)
# dev.off()
# }
return(r)
}
#' Cellularity clustering
#'
#' Wrap up function that clusters cellularities. This is based on the most likely possibility for each mutation, give ints frequency and genotype.
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting variant "Alt", as well as the total number of
#' reads overlapping position "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param simulated Should be TRUE if the data has been been generated by the QuantumCat algorithm
#' @param save_plot Should the plots be saved? Default is TRUE
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0.
#' No longer used for clustering.
#' @param Initializations Number of initial conditions to be tested for the EM algorithm
#' @param preclustering The type of preclustering used for priors: "Flash","kmedoid" or NULL. NULL will generate
#' centers using uniform distribution. WARNING: overrides priors given
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights
#' between two optimization steps. If NULL, will take 1/(average depth)
#' @param ncores Number of cores to be used during EM algorithm
#' @param clone_priors List of vectors with the putated position of clones
#' @param prior_weight Numeric with the proportion mutations in each clone
#' @param nclone_range A number or range of clusters that should be used for clustering
#' @param restrict.to.AB Boolean: Should the analysis keep only sites located in A and AB sites in all samples?
#' @param output_directory Directory in which to save results
#' @param optim use L-BFS-G optimization from R ("default"), or from optimx ("optimx"), or Differential Evolution ("DEoptim")
#' @param keep.all.models Should the function output the best model (default; FALSE), or all models tested (if set to true)
#' @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric.
#' In numeric, the function
#'uses a variant of the BIC by multiplication of the k*ln(n) factor.
#'If >1, it will select models with lower complexity.
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference
#' @export
#' @examples
#' Mutations<-QuantumClone::Input_Example
#' for(i in 1:2){
#' Mutations[[i]]<-cbind(rep(paste("Example_",i,sep=""),times=10),Mutations[[i]])
#' colnames(Mutations[[i]])[1]<-"Sample"
#' }
#' print("The data should look like this:")
#' print(head(Mutations[[1]]))
#'
#' cat("Cluster data: will try to cluster between 3 and 4 clones, with 1 maximum search each time,
#' and will use priors from preclustering (e.g. k-medoids on A and AB sites)")
#' print("The genotype is provided in the list frame, and
#' there is no associated data from FREEC to get genotype from.")
#' print("The computation will run on a single CPU.")
#' Clustering_output<-One_step_clustering(SNV_list = Mutations,
#' FREEC_list = NULL,contamination = c(0,0),nclone_range = 3:4,
#' clone_priors = NULL,prior_weight = NULL ,
#' Initializations = 1,preclustering = "FLASH", simulated = TRUE,
#' save_plot = TRUE,ncores=1,output_directory="Example")
#' print("The data can be accessed by Clustering_output$filtered_data")
#' print("All plots are now saved in the working directory")
#'
#'
One_step_clustering<-function(SNV_list,FREEC_list=NULL,
contamination,nclone_range=2:5,
clone_priors=NULL,prior_weight=NULL,Initializations=1,preclustering="FLASH",
simulated = FALSE, epsilon = NULL,
save_plot = TRUE,ncores=1,
restrict.to.AB = FALSE,output_directory=NULL,
model.selection = "BIC",optim = "default", keep.all.models = FALSE,
force.single.copy = FALSE){
# if(Initializations >1 && preclustering){
# message("Only 1 iteration will be run if preclustering is successful")
# }
### Checking input arguments
Sample_name<-as.character(SNV_list[[1]][1,1])
Sample_names<-lapply(SNV_list,FUN = function(z) z[1,1])
### Checking genotype input
if(is.null(FREEC_list)){
message("FREEC_list is empty. Checking that there is a genotype column in all samples...")
check<-TRUE
for(i in 1:length(SNV_list)){
if(is.null(SNV_list[[i]][,"Genotype"])){
warning(paste("Sample",i,"does not have a Genotype column"))
check<-FALSE
}
}
if(!check){
stop("See warnings for more details")
}
else{
message("Genotype is provided.")
Genotype_provided<-TRUE
}
}
else{
message("Checking that SNV_list and FREEC_list have the same number of samples...")
if(length(SNV_list)!=length(FREEC_list)){
stop("Incorrect input: number of samples different for SNV and FREEC")
}
message("Passed")
Genotype_provided<-FALSE
}
### Checking model selection
if(length(model.selection)>1){
stop("Model selection can only be one of BIC,AIC or a numeric value.")
}
else{
if(is.character(model.selection)){
if(model.selection != "BIC" && model.selection != "AIC" ){
stop("Character value for model selection can only be BIC or AIC")
}
}
else if(!is.numeric(model.selection)){
stop("Input model.selection is not numeric - and is not BIC or AIC. Please see documentation for model.selection.")
}
}
if(is.null(epsilon)){
epsilon<-1/mean(unlist(lapply(SNV_list,function(df) df$Depth)))
message(paste("epsilon set to:",epsilon))
}
### check optim:
#optims_accepted<-c("default","optimx","DEoptim","RcppDE")
optims_accepted<-c("default","optimx","DEoptim","compound")
if(length(optim)!=1){
stop("optim argument can only be of length 1")
}
else if(!(optim %in% optims_accepted)){
stop(paste("optim can only be one of:", paste(optims_accepted,collapse = ",")))
}
# Checking length of contamination vector:
if(is.list(SNV_list) && length(contamination)!=length(SNV_list)){
if(length(contamination)<length(SNV_list)){
warning("contamination and SNV_list have different lengths, will repeat contamination")
contamination<-rep(contamination, times = length(SNV_list))
}
else if(length(contamination)<length(SNV_list)){
warning("contamination and SNV_list have different lengths, will use first arguments of contamination")
contamination<-contamination[1:length(SNV_list)]
}
}
else if(is.data.frame(SNV_list) && length(contamination >1 )){
warning("length of contamination > 1 but only one sample, will use only first element")
contamination<-contamination[1]
}
### Pre-processing data
message(paste("Checking all possibilities for",Sample_name))
Cell<-From_freq_to_cell(SNV_list = SNV_list,
FREEC_list = FREEC_list,
Sample_names = Sample_names,
contamination = contamination,
Genotype_provided = Genotype_provided,
save_plot = save_plot,
restrict.to.AB = restrict.to.AB,
output_directory=output_directory,
force.single.copy = force.single.copy)
message("Starting clustering...")
r<-Cluster_plot_from_cell(Cell = Cell,nclone_range = nclone_range,
epsilon = epsilon,
Sample_names = Sample_names,
preclustering = preclustering,
clone_priors = clone_priors,
prior_weight = prior_weight,
Initializations = Initializations,
simulated = simulated,
save_plot=save_plot,
contamination = contamination,
ncores=ncores,
output_directory=output_directory,
model.selection = model.selection,
optim = optim,
keep.all.models = keep.all.models)
if(length(SNV_list)==1 & save_plot){
q<-One_D_plot(EM_out = r,contamination = contamination)
if(is.null(output_directory)){
ggplot2::ggsave(plot = q, filename = paste(Sample_name,'/', 'Density', Sample_name,'.pdf',sep=''),width = 6.04,height = 6.04)
}
else{
ggplot2::ggsave(plot = q, filename = paste(output_directory,'/', 'Density', Sample_name,'.pdf',sep=''),width = 6.04,height = 6.04)
}
}
#### New version
if(length(unique(Cell[[1]]$id))==length(Cell[[1]]$id)){ # All mutations had a single possible state
message("Post-processing output")
if(keep.all.models){
r<-lapply(r, FUN = function(z){
Tidy_output(r = z,
Genotype_provided = Genotype_provided,
SNV_list = SNV_list )
}
)
}
else{
r<-Tidy_output(r =r,
Genotype_provided = Genotype_provided,
SNV_list = SNV_list)
}
}
else{
### Selecting best state for each variant with best criterion
### Then recluster (and keep model if needed)
message("Keeping most likely state for each variant...")
### 1. Select best clusters
if(keep.all.models){
Crits<-as.numeric(as.character(unlist(lapply(r, function(z){
z$Crit
}))
)
)
r<-r[[which.min(Crits)]]
}
r<-Tidy_output(r =r,
Genotype_provided = Genotype_provided,
SNV_list = SNV_list)
message("Reclustering with most likely state of each variant...")
r<-Cluster_plot_from_cell(Cell = r$filtered.data,
nclone_range = nclone_range,
epsilon = epsilon,
Sample_names = Sample_names,
preclustering = preclustering,
clone_priors = clone_priors,
prior_weight = prior_weight,
Initializations = Initializations,
simulated = simulated,
save_plot=save_plot,
contamination = contamination,
ncores=ncores,
output_directory=output_directory,
model.selection = model.selection,
optim = optim,
keep.all.models = keep.all.models)
}
if(is.list(r$EM.output$centers)){
if(!keep.all.models){
normalized.centers<-list()
for(i in 1:length(r$EM.output$centers)){
normalized.centers[[i]]<-r$EM.output$centers[[i]]/(1-contamination[i])
}
r$EM.output$normalized.centers<-normalized.centers
}
else{
for(mod in 1:length(r)){
normalized.centers<-list()
for(i in 1:length(r$EM.output$centers)){
normalized.centers[[i]]<-r[[mod]]$EM.output$centers[[i]]/(1-contamination[i])
}
r[[mod]]$EM.output$normalized.centers<-normalized.centers
}
}
}
r
}
#' Tidying output from EM
#'
#' Tidying input by Chr Start
#' @param r output from Cluster_plot_from_cell
#' @param Genotype_provided If the FREEC_list is provided, then should be FALSE (default), otherwise TRUE
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#'
Tidy_output<-function(r, Genotype_provided,SNV_list){
Work_input<-SNV_list[[1]]
Work_output<-r$filtered.data[[1]]
if(Genotype_provided){
commonCols<-c("Chr","Start","Depth","Alt","Genotype")
}else{
commonCols<-c("Chr","Start","Depth","Alt")
}
Keep<-apply(Work_input[,c("Chr","Start","Depth")],MARGIN = 1, function(z){
t<-apply(Work_output[,c("Chr","Start","Depth")],MARGIN = 1,function(y){
return(all(y == z))
})
if(sum(t)==0){
return(FALSE)
}
else{
return(TRUE)
}
})
Cell<-list()
for(i in 1:length(SNV_list)){
Cell[[i]]<-SNV_list[[i]][Keep,]
}
result<-r
for(i in 1:length(r$filtered.data)){
result$filtered.data[[i]]<-merge(r$filtered.data[[i]], Cell[[i]],by = commonCols)
}
### Extract new order of mutations:
Order<-apply(result$filtered.data[[1]][,c("Chr","Start","Depth")],MARGIN = 1, function(z){
t<-apply(r$filtered.data[[1]][,c("Chr","Start","Depth")],MARGIN = 1,function(y){
all(z == y)
})
return(which(t))
})
result$cluster<-result$cluster[Order]
result$EM.output$fik<-result$EM.output$fik[Order,]
return(result)
}
#' Wrap-up function
#'
#' Function that computes the most likely position for each mutation based on the genotype
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param Genotype_provided If the FREEC_list is provided, then should be FALSE (default), otherwise TRUE
#' @param save_plot Should the plots be saved? Default is TRUE
#' @param Sample_names Name of the samples
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0.
#' No longer used for clustering.
#' @param ncores Number of cores to be used during EM algorithm
#' @param restrict.to.AB Should the analysis keep only sites located in A and AB sites in all samples?
#' @param output_directory Directory in which to save results
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference
#'
From_freq_to_cell<-function(SNV_list,FREEC_list=NULL,Sample_names,Genotype_provided=FALSE,save_plot=TRUE,
contamination,ncores = 4, restrict.to.AB = FALSE,output_directory=NULL,
force.single.copy = FALSE){
if(save_plot){
if(is.null(output_directory)){
dir.create(path = paste(Sample_names[1]), showWarnings = FALSE)
}
else{
dir.create(path=output_directory,showWarnings = FALSE)
}
}
if(Genotype_provided){
Schrod_out<-Patient_schrodinger_cellularities(SNV_list = SNV_list,Genotype_provided =TRUE,
contamination = contamination, restrict.to.AB = restrict.to.AB,
force.single.copy = force.single.copy)
}
else{
Schrod_out<-Patient_schrodinger_cellularities(SNV_list = SNV_list, FREEC_list = FREEC_list,
contamination = contamination, restrict.to.AB = restrict.to.AB,
force.single.copy = force.single.copy)
}
if(save_plot){
plot_cell_from_Return_out(Schrod_out,Sample_names,output_directory)
}
return(Schrod_out)
}
#' Patient Schrodinger Cellularities
#'
#' Computes all possible cellularities for all mutations across all samples. Calls CellularitiesFromFreq on all mutations to evaluate all possibilities
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param FREEC_list list of dataframes from FREEC for each samples (usually named Sample_ratio.txt), in the same order as SNV_list
#' @param Genotype_provided If the FREEC_list is provided, then should be FALSE (default), otherwise TRUE
#' @param contamination Numeric vector describind the contamination in all samples (ranging from 0 to 1). Default is 0.
#' No longer used for clustering.
#' @param restrict.to.AB Should the analysis keep only sites located in A and AB sites in all samples?
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference
Patient_schrodinger_cellularities<-function(SNV_list,FREEC_list=NULL,Genotype_provided=FALSE,
contamination, restrict.to.AB = FALSE,
force.single.copy = FALSE){
result<-list()
count<-0
id<-1
chr<-SNV_list[[1]][,"Chr"]
chr_ante<-0
### Compute all possible cellularities and join them
for(i in 1:nrow(SNV_list[[1]])){ ##Exploring all possibilities for each mutation
Cell<-list()
test<-T
for(k in 1:length(SNV_list)){
if(test){ ## Do not look at position if it is invalid in another sample previously explored
if(!is.null(SNV_list[[k]]$subclone.genotype)){
if(is.na(SNV_list[[k]]$subclone.genotype)){
subclone.geno<-NULL
}
}
else{
subclone.geno<-SNV_list[[k]][i,"subclone.genotype"]
}
if(Genotype_provided){
Cell[[k]]<-cbind(CellularitiesFromFreq(Genotype= as.character(SNV_list[[k]][i,'Genotype']),
Alt = SNV_list[[k]][i,"Alt"],
Depth = SNV_list[[k]][i,"Depth"],
subclone.genotype = subclone.geno,
subclone.cell = SNV_list[[k]][i,"subclone.cell"],
chr = SNV_list[[k]][i,'Chr'],
position = SNV_list[[k]][i,'Start'],
contamination = contamination[k],
restrict.to.AB = restrict.to.AB,
force.single.copy = force.single.copy),
id)
}
else{
if(chr[i]!=chr_ante){
CHR_FREEC<-lapply(FREEC_list,
function(z){
z[as.character(z[,"Chromosome"])==strsplit(as.character(chr[i]),
split = "r")[[1]][2],]
} )
chr_ante<-chr[i]
}
Cell[[k]]<-cbind(CellularitiesFromFreq(Freec_ratio = CHR_FREEC[[k]],
Alt = SNV_list[[k]][i,"Alt"],Depth = SNV_list[[k]][i,"Depth"],
subclone.genotype = subclone.geno,
subclone.cell = SNV_list[[k]][i,"subclone.cell"],
chr = SNV_list[[k]][i,'Chr'], position = SNV_list[[k]][i,'Start'],
contamination = contamination[k],
restrict.to.AB = restrict.to.AB,
force.single.copy = force.single.copy),
id)
}
if(sum(is.na(Cell[[k]]))>0){
test<-F
count<-count+1
}
}
}
if(test){ ## Checking that genotype is available
L<-list()
for(r in 1:length(Cell)){
L[[r]]<-1:(nrow(Cell[[r]]))
}
U<-expand.grid(L) ##Table with all Cell row combinations
for(k in 1:(ncol(U))){
if(id==1){
result[[k]]<-Cell[[k]][U[,k],]
}
else{
result[[k]]<-rbind(result[[k]],Cell[[k]][U[,k],])
}
}
id<-id+1
}
}
if(count>0){
warning(paste(count,'mutations excluded due to missing genotype or normalization issues'))
}
return(result)
}
#' Cellularities from allele frequency
#'
#' Creates all possibilities for one mutation in one sample (given a genotype), then computes
#' the cellularity associated with each possibility and finally the probability of each possibility
#' @param chr The chromosome on which is the position (numeric value, not chr1 as in BED files)
#' @param position The genomic position of the mutation
#' @param Freec_ratio The FREEC output associated with the sample of interest
#' @param Genotype If the FREEC output is not given, the genotype associated with the locus (for example AAB)
#' @param Alt Number of reads supporting the variation
#' @param Depth Number of reads mapped at the position
#' @param subclone.genotype If existing, the genotype of the subclone. Else NULL
#' @param subclone.cell The cellular prevalence of the subclone which has a different Copy Number at this site
#' @param contamination The fraction of normal cells in the sample
#' @param restrict.to.AB Should the analysis keep only sites located in A and AB sites in all samples?
#' @param force.single.copy Should all mutations in overdiploid regions set to single copy? Default is FALSE
#' @keywords Clonal inference
CellularitiesFromFreq<-function(chr, position,Alt,Depth,
Freec_ratio=NULL, Genotype=NULL,subclone.genotype=NULL,
subclone.cell=NULL,contamination, restrict.to.AB = FALSE,
force.single.copy = FALSE){
##For 1 mutation
if(!is.null(Freec_ratio)){
if(grepl(pattern = "chr",x = chr,ignore.case = T)){
FChr<-sapply(X = 'chr',FUN = paste, Freec_ratio[,'Chromosome'],sep='')
}
else{
FChr<-Freec_ratio[,'Chromosome']
}
Genotype<-as.character(tail(Freec_ratio[FChr==chr & Freec_ratio[,'Start']<position,'Genotype'],1))
}
if(length(Genotype)==0){
message(paste("Genotype is not available at position",chr,position))
return(NA)
}
else if(is.na(Genotype)){
message(paste("Genotype is not available at position",chr,position))
return(NA)
}
else if(Genotype==-1){
message(paste("Genotype is not available at position",chr,position))
return(NA)
}
else if(restrict.to.AB & (Genotype!='A' & Genotype!='AB')){
message(paste("Position",chr,position,"is not in an A or AB site. Genotype:",Genotype))
return(NA)
}
else if (is.null(subclone.genotype) | is.null(subclone.cell)){
As<-strcount(x = Genotype, pattern = 'A',split = '')
Ns<-nchar(Genotype)
if(force.single.copy){
### Only one possibility per mutation
result<-data.frame(Chr = chr,
Start = position,
Cellularity = as.numeric(
{Alt/Depth}*{Ns + contamination/(1-contamination)*2}
),
Genotype = Genotype,
Alt = Alt,
Depth = Depth,
NC = 1,
NCh = Ns)
}
else{
### Vectorized version:
result<-data.frame(Chr = chr,
Start = position,
Cellularity = as.numeric(
{Alt/Depth}*{Ns + contamination/(1-contamination)*2}/(1:As)
),
Genotype = Genotype,
Alt = Alt,
Depth = Depth,
NC = 1:As,
NCh = Ns)
}
}
else{ ## Two possibilities: belong to clone or subclone
result<-data.frame()
As<-strcount(x = Genotype, pattern = 'A',split = '')
Ns<-nchar(Genotype)
for(i in 1:As){
Cellularity<-as.numeric(frequency/100*{Ns+contamination/(1-contamination)*2}/i)
spare<-data.frame(chr,position,Cellularity, Genotype,Alt,Depth,i,Ns)
colnames(spare)<-c('Chr','Start','Cellularity','Genotype',"Alt","Depth","NC","NCh")
result<-rbind(result,spare)
}
A.sub<-strcount(x=subclone.genotype,pattern = "A",split = '')
N.sub<-nchar(subclone.genotype)
for(j in A.sub){
### Keep only possibilities that have a cellularity lower than the subclonal cellularity
Cellularity<-as.numeric(frequency/100*{N.sub+contamination/(1-contamination)*2}/j)
if(Cellularity<=subclone.cell){
spare<-data.frame(chr,position,Cellularity, subclone.genotype,Alt,Depth,j,N.sub)
colnames(spare)<-c('Chr','Start','Cellularity','Genotype',"Alt","Depth","NC","NCh")
result<-rbind(result,spare)
}
}
}
result<-data.frame(result)
colnames(result)<-c('Chr','Start','Cellularity','Genotype',"Alt","Depth","NC","NCh")
return(result)
}
#'String count
#'
#' Counting the number of characters for each element of a vector
#'
#' @param x The vector from which elements should be counted
#' @param pattern Pattern to be recognized. Default is ''
#' @param split Pattern used to split elements of the vector. Default is ''
#' @keywords Text handling
strcount <- function(x, pattern='', split=''){
unlist(lapply(strsplit(x, split),function(z) na.omit(length(grep(pattern, z)))))
}
#' Cellularity clustering
#'
#' Clustering cellularities based on the most likely presence of a clone, using the pamk algorithm (fpc package). Clustering can be guided by toggling manual_clustering on and/or giving a range of number of clusters.
#' @param Cell Output from Return_one_cell_by_mut, list of cellularities (one list-element per sample)
#' @param Sample_names Name of the sample
#' @param simulated Was the data generated by QuantumCat?
#' @param save_plot Should the clustering plots be saved? Default is True
#' @param contamination The fraction of normal cells in the samples
#' @param clone_priors If known a list of priors (cell prevalence) to be used in the clustering
#' @param prior_weight If known a list of priors (fraction of mutations in a clone) to be used in the clustering
#' @param nclone_range Number of clusters to look for
#' @param Initializations Maximal number of independant initial condition tests to be tried
#' @param preclustering The type of preclustering used for priors: "Flash","kmedoid" or NULL. NULL will generate
#' centers using uniform distribution.
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights between two optimization steps.
#' @param ncores Number of CPUs to be used
#' @param output_directory Directory in which to save results
#' @param optim use L-BFS-G optimization from R ("default"), or from optimx ("optimx"), or Differential Evolution ("DEoptim")
#' @param keep.all.models Should the function output the best model (default; FALSE), or all models tested (if set to true)
#' @param model.selection The function to minimize for the model selection: can be "AIC", "BIC", or numeric. In numeric, the function
#'uses a variant of the BIC by multiplication of the k*ln(n) factor. If >1, it will select models with lower complexity.
#' @importFrom fpc pamk
#' @importFrom ggplot2 ggsave
#' @keywords Clonal inference
#'
Cluster_plot_from_cell<-function(Cell,Sample_names,simulated,save_plot=TRUE,
contamination, clone_priors,prior_weight,nclone_range,Initializations,preclustering=TRUE,
epsilon=5*(10**(-3)),ncores = 2,output_directory=NULL,
model.selection = "BIC",optim = "default", keep.all.models = FALSE){
preclustering_success<-FALSE
preclustering_FLASH<-FALSE
##### PRECLUSTERING
###################
if(!is.null(preclustering)){
if(preclustering=="kmedoid"){
for(i in 1:length(Cell)){
if(i==1){
select<-Cell[[1]][,"Genotype"]=='A'| Cell[[1]][,"Genotype"]=='AB'
Spare<-Cell[[1]][,'Cellularity']
}
else{
select<- select & (Cell[[i]][,"Genotype"]=='A'| Cell[[i]][,"Genotype"]=='AB')
Spare<-cbind(Spare,Cell[[i]][,'Cellularity'])
}
}
if(length(Cell)==1){
Spare<-Spare[select]
if(length(Spare)==0){
warning("No A and AB sites to do preclustering")
p_clone<-clone_priors
p_weight<-prior_weight
}
}
else{
Spare<-Spare[select,] ##restriction to A AB sites
}
if(is.null(dim(Spare))){
warning("No A and AB sites to do preclustering")
p_clone<-clone_priors
p_weight<-prior_weight
}
else{
if(nrow(Spare)<=max(nclone_range)){
warning("Too few mutations to cluster. Will use priors / random initial conditions")
p_clone<-clone_priors
p_weight<-prior_weight
}
else{
Spare[Spare>1]<-1
kmeans<-fpc::pamk(Spare,krange = nclone_range,usepam = F)
p_clone<-list()
p_weight<-numeric()
create_prior_weight<- nrow(Spare)>=50
for(j in 1:(ncol(kmeans$pamobject$medoids))){
p_clone[[j]]<-as.numeric(kmeans$pamobject$medoids[,j])
if(create_prior_weight){
p_weight[j]<-sum(kmeans$pamobject$clustering==j)/length(kmeans$pamobject$clustering)
}
}
if(!create_prior_weight){
p_weight<-prior_weight
}
preclustering_success<-T
}
}
}
else{ ### "FLASH" preclustering
preclustering_FLASH<-TRUE
result<-EM_clustering(Schrod = Cell,contamination = contamination,
epsilon = epsilon,
Initializations = Initializations,
nclone_range = nclone_range,
ncores = ncores,
model.selection = model.selection,
optim = optim,
keep.all.models = keep.all.models,
FLASH = TRUE)
}
}
else{
p_clone<-clone_priors
p_weight<-prior_weight
}
#################
### Clustering with EM
#################
if(!preclustering_FLASH){
if(preclustering_success){
result<-EM_clustering(Schrod = Cell,contamination = contamination,epsilon = epsilon,
prior_weight = p_weight,clone_priors = p_clone,Initializations = Initializations,
nclone_range = nclone_range,ncores = ncores,
model.selection = model.selection,optim = optim,
keep.all.models = keep.all.models)
}
else{
result<-EM_clustering(Schrod = Cell,contamination = contamination,epsilon = epsilon,
prior_weight = p_weight,clone_priors = p_clone,Initializations = Initializations,
nclone_range = nclone_range,ncores = ncores,
model.selection = model.selection,optim = optim,keep.all.models = keep.all.models)
}
}
#### Plots possibilities
if(save_plot && length(Cell)>1){
plot_cell_from_Return_out(result$filtered.data,
Sample_names = Sample_names,
output_dir=output_directory)
}
return(result)
}
#' Probability
#'
#' Returns dataframe with all informations about mutation (Number of copies, Cellularity, etc.) and probability to belong to a clone
#' @param SNV_list A list of dataframes (one for each sample), with as columns : (for the first column of the first sample the name of the sample),
#' the chromosome "Chr",the position of the mutation "Start", the number of reads supporting the mutation "Alt", the depth of coverage at this locus "Depth",
#' and if the output from FREEC for the samples are not associated, the genotype "Genotype".
#' @param clone_prevalence List of numeric vectors giving the cellular prevalence of each clone in each sample, not normalized for contamination.
#' This should be stored in `QC_output$EM.output$centers`
#' @param contamination Numeric vector giving the contamination by normal cells
#' @param clone_weights Numeric vector giving the proportion of mutations in each clone
#' @keywords Clonal inference phylogeny
#' @export
#' @examples
#' set.seed(123)
#' SNVs<-QuantumCat(number_of_clones = 2,number_of_mutations = 50,number_of_samples = 1,ploidy = "AB")
#' Probability.to.belong.to.clone(SNV_list=SNVs,
#' clone_prevalence=list(c(0.5,1),c(0.5,1)),contamination=c(0,0))
Probability.to.belong.to.clone<-function(SNV_list,
clone_prevalence,
contamination,
clone_weights=NULL){
if(is.null(clone_weights)){
clone_weights<-rep(1/(length(clone_prevalence[[1]])),times = length(clone_prevalence[[1]]))
}
if(is.null(SNV_list[[1]]$NC)){ ### The output has not been through clustering
Schrod<-Patient_schrodinger_cellularities(SNV_list = SNV_list,Genotype_provided = TRUE,
contamination = contamination)
result<- eval.fik(Schrod = Schrod,
centers = clone_prevalence,
weights= clone_weights,keep.all.poss = TRUE,
adj.factor = Compute.adj.fact(Schrod = Schrod)
)
filtered<-filter_on_fik(Schrod = Schrod,fik = result)
filtered_prob<-Probability.to.belong.to.clone(SNV_list = filtered,clone_prevalence = clone_prevalence,
contamination = contamination,clone_weights = clone_weights)$filtered_prob
}
else{### The output has been through clustering
Schrod<-SNV_list
adj.fact<-Compute.adj.fact(SNV_list)
result<-eval.fik(Schrod = SNV_list,
centers = clone_prevalence,
weights =clone_weights,
keep.all.poss = TRUE,
adj.factor = adj.fact
)
filtered_prob<-result
filtered <-SNV_list
}
clustering<-apply(X = filtered_prob,MARGIN = 1,FUN = function(z) {
if(sum(z==max(z))>1){ ### Look for the multiple clones, and attribute with probability proportional to the weight
if(max(z)>0){
pos<-which(z==max(z))
prob<-clone_weights[pos]/(sum(clone_weights[pos]))
#sample(x = pos, size = 1, prob = prob))
return(pos[which.max(prob)])
}
else{ ### all possibilities have 0 probability, so choose one randomly
return(sample(1:length(z),size = z))
}
}
else{ # only one clone has maximal probability
return(which.max(z))
}
})
return(list(unfiltered_prob = result,
unfiltered_data = Schrod,
filtered_prob =filtered_prob,
filtered_data = filtered,
cluster = clustering)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.