#' Runs Ploidetect on read count and allele frequency data
#' @param all_data input data, formatted as a data.frame where each row is a genomic bin. Columns correspond to
#' somatic read counts, normal read counts, allele frequencies, window id (in the format chr_pos), window size, and GC content
#' @param normal column index of normal read depth
#' @param tumour column index of tumour read depth
#' @param avg_allele_freq column index of snp allele frequencies in the tumour
#' @param window_id column index of window id (formatted as chr_pos)
#' @param window_size column index for size of the genomic bin in bp
#' @param GC column index of GC content as a percentage
#' @param plots Logical. should plots be output?
#' @param verbose Logical. Print verbose messages?
#' @param lowest Integer from 0-2. Optional. Forces the copy number identity of the lowest fit peak in the read depth KDE
#' @param comp Integer. Forces selection of a specific comparator peak in the read depth kernel density estimate (KDE) for tumour content modeling
#' @param cndiff Integer. Forces the copy number difference between tallest peak in read depth KDE and the peak selected in comp
#' @param segmentation_threshold Float between 0 and 1. Threshold for segmentation of adjacent bins during copy number calling.
#' Default is 0.75. Smaller values will produce more segments, but may result in over-segmentation of copy number data.
#' @param CNA_call Logical. Should copy number calling be performed?
#' @param debugplots Logical. Should plots of data normalization be output?
#' @return A named list, containing three elements:
#' TC_calls: A data.frame containing all models considered for modeling tumour purity and ploidy
#' plots: a list of plots. Plot 1 is always a scatter plot where the x-axis is window size, y-axis is corrected somatic read counts, and
#' points are colored by SNP allele frequency. Plot 2 shows the KDE of the data, with all peaks shown. Next, the models in TC_calls are plotted,
#' one plot per model. Finally, if CNA_call was TRUE, copy number data is plotted, one plot per chromosome.
#' CNA_calls: a data.frame containing segmented copy number and LOH calls
#' @examples
#' ## Run Ploidetect without specifying a model
#' ploidetect(all_data)
#' ## Run Ploidetect and force a model which fits the second most common copy number as being a true integer copy number
#' ## with copy number equal to Ploidy +- 1, and the lowest common copy number being homozygous deletion
#' ploidetect(all_data, comp = 2, cndiff = 1, lowest = 0)
#' ## Run Ploidetect with the above model and also output copy number data
#' ploidetect(all_data, comp = 2, cndiff = 1, lowest = 0, CNA_call = T)
#' ## Run Ploidetect with a custom segmentation_threshold
#' ploidetect(all_data, comp = 2, cndiff = 1, lowest = 0, CNA_call = T, segmentation_threshold = 0.5)
#' @export
ploidetect <- function(all_data, normal = 2, tumour = 1, avg_allele_freq = 3, window_id = 4, window_size=5, GC = 6, plots = F, verbose = F, lowest = NA, comp=NA, cndiff=NA, segmentation_threshold = 0.75, CNA_call = F, debugPlots = F){
## Initialize plots object
plots <- list()
## Simplify_size
simplify_size = 100000
## Load centromeres
centromeres <- centromeres
## Add columns to input data
names(all_data) <- c("chr", "pos", "end", "tumour", "normal", "maf", "gc")
## Run ploidetect_preprocess
output <- ploidetect_preprocess(all_data = all_data, verbose = verbose, debugPlots = debugPlots, tumour = tumour, normal = normal, avg_allele_freq = avg_allele_freq, window_id = window_id, window_size = window_size, GC = GC, simplify = T, simplify_size = simplify_size)
## Unpack the output list from ploidetect_preprocess
maxpeak <- output$maxpeak
filtered <- output$x
highoutliers <- output$highoutliers
unaltered <- output$merged
bw = maxpeak/40
den <- density(filtered$residual, n = nrow(filtered), bw = bw)
## Normalize the density to 0->1 range
den$y <- (den$y - min(den$y))/(max(den$y) - min(den$y))
## Sanity check - see if ploidetect_preprocess resulted in a worse separation of peaks than the initial data
den2 <- density(filtered$y_raw, n = nrow(filtered), bw = bw)
den2$y <- (den2$y - min(den2$y))/(max(den2$y) - min(den2$y))
#den3 <- density(filtered$fan_correction, n = nrow(filtered), bw = bw)
#den3$y <- (den3$y - min(den3$y))/(max(den3$y) - min(den3$y))
## If ploidetect_preprocess made things worse, then revert to raw input data
decision <- which.min(c(mean(den$y), mean(den2$y)))#, mean(den3$y)))
if(decision != 1){
if(decision == 2){
filtered$residual <- filtered$y_raw - maxpeak
}
#if(decision == 3){
# filtered$residual <- filtered$fan_correction
#}
if(verbose){
}
}
## Perform peak calling
allPeaks <- peakcaller(filtered[findInterval(filtered$residual, vec = quantile(filtered$residual, probs = c(0, 0.999))) == 1,], bw, maxpeak = maxpeak)
## If we call zero peaks (due to poor bw, for example), try with half bandwidth and throw a warning
if(nrow(allPeaks) == 1){
warning("Zero peaks detected. Attempting peak calling with lower bandwidth")
bw = bw/2
allPeaks <- peakcaller(filtered[findInterval(filtered$residual, vec = quantile(filtered$residual, probs = c(0, 0.999))) == 1,], bw, maxpeak = maxpeak)
}
## Run processallpeaks
output <- ploidetect_processallpeaks(filtered, allPeaks)
## Unpack the output list from processallpeaks
filtered <- output$filtered
allPeaks <- output$allPeaks %>% filter(pos + maxpeak > 0)
nomaf <- output$nomaf
## Generate coverage plots for interpretation
filteredforplot <- filtered %>% filter(residual < max(allPeaks$pos) + maxpeak)
filteredforplot$mafflipped <- lapply(filteredforplot$mafflipped, function(x)median(unmerge_mafs(x, flip = T))) %>% unlist()
filteredforplot$residual <- filteredforplot$residual + maxpeak
plot <- ggplot(data = filteredforplot, mapping = aes_string(x = "window_size", y = "residual", color = "mafflipped")) + geom_point(size = 0.1, alpha = 0.1) +
#xlab("Window size") +
xlab("Window size of constant-coverage bins") +
#ylab("Reads mapping to bins in Somatic") +
ylab("Normalized somatic read counts") +
#ggtitle(paste0("Coverage vs normalized bin size (Surrogate for Mappability + GC bias)")) +
ggtitle(paste0("Coverage Plot for Filtered and Normalised Data")) +
scale_colour_viridis(option = "plasma", name = "Major Allele\n Frequency") +
#scale_x_continuous(limits = quantile(filtered$size, probs = c(0.05, 0.99))) +
theme_bw(base_size = 12)
plots <- c(plots, list(plot))
den <- density(filteredforplot$residual, n = nrow(filteredforplot), bw = bw)
dendf <- data.frame(x = den$x, y = den$y)
# Normalize the density to 0->1 range and plot
dendf$y <- (dendf$y - min(dendf$y))/(max(dendf$y) - min(dendf$y))
plot <- ggplot(data = dendf, mapping = aes(x = x, y = y)) + geom_line() +
xlab("Normalized somatic read counts") +
ylab("Normalized Density") +
ggtitle(paste0("Kernel Density Estimate of Count Data")) +
geom_vline(data = allPeaks, aes(xintercept = pos + maxpeak), linetype = 2) +
geom_text(data = allPeaks, aes(x = pos + maxpeak, y = allPeaks$height + 0.05, label = paste0("MAF = ", round(mainmaf, digits = 3)))) +
theme_bw()
plots <- c(plots, list(plot))
rerun = F
if(!all(is.na(c(comp, cndiff)))){
rerun=T
}
xdists <- ploidetect_roughmodels(allPeaks = allPeaks, maxpeak, verbose = verbose, rerun = rerun)
## Can't really do much else if there's only one peak in the data, so we return a message explaining this and exit
if(nrow(allPeaks) == 1){
return(list("TC_calls" = xdists, "plots" = plots, "CN_calls" = NA))
}
## Normalize allPeaks to maxPeak
allPeaks$pos <- allPeaks$pos + maxpeak
## If this is a "rerun" case, ie. if the user is specifying the model for Ploidetect to use
if(rerun){
xdists <- xdists %>% filter(Comparator_peak_rank == comp & Copy_number_difference_between_peaks == cndiff)
}
if(nrow(xdists) == 0 & rerun){
stop("You are trying to force models which do not exist! Check the model information you entered and try again")
}
TC_calls <- list()
for(i in 1:nrow(xdists)){
modelbuilder_output <- modelbuilder_iterative(xdists[i,], allPeaks = allPeaks, lowest = lowest, filtered = filtered, strict = T, get_homd = F, mode = "TC", nomaf = nomaf, rerun = rerun, maxpeak = maxpeak, bw = bw, verbose = verbose)
TC_calls <- c(TC_calls, list(modelbuilder_output$out))
plots <- c(plots, list(modelbuilder_output$outplot))
}
#TC_calls <- lapply(xdists, function(x) modelbuilder_iterative(xdists = x, allPeaks = allPeaks, lowest = NA, filtered = filtered, strict = T, get_homd = F, mode = "TC", nomaf = nomaf, rerun = rerun, maxpeak = maxpeak, bw = bw))
TC_calls <- do.call(rbind.data.frame, TC_calls)
if(nrow(TC_calls) == 0){
TC_calls <- list()
plots <- plyr::compact(plots)
for(i in 1:nrow(xdists)){
modelbuilder_output <- modelbuilder_iterative(xdists[i,], allPeaks = allPeaks, lowest = lowest, filtered = filtered, strict = F, get_homd = F, mode = "TC", nomaf = nomaf, rerun = rerun, maxpeak = maxpeak, bw = bw)
TC_calls <- c(TC_calls, list(modelbuilder_output$out))
plots <- c(plots, list(modelbuilder_output$outplot))
}
TC_calls <- do.call(rbind.data.frame, TC_calls)
}
TC_calls <- TC_calls %>% mutate("order" = 1:nrow(.)) %>% group_by(reads_per_copy, zero_copy_depth, Ploidy, tumour_purity) %>% dplyr::summarise_all(.funs = first) %>% arrange(model_error, maf_error)
if(length(TC_calls) == 1){
return(list("TC_calls" = TC_calls, "plots" = plots, "CN_calls" = NULL))
}
plots <- plyr::compact(plots)
ordering <- TC_calls$order
plots <- plots[c(1, 2, ordering + 2)]
if(!CNA_call){
return(list("TC_calls" = TC_calls, "plots" = plots, "CN_calls" = NULL))
}
best_model = xdists[which(xdists$Comparator_peak_rank == TC_calls$Comparator[1] & xdists$Copy_number_difference_between_peaks == TC_calls$CN_diff[1]),]
output <- modelbuilder_iterative(xdists = best_model[1,], allPeaks = allPeaks, lowest = TC_calls$lowest_peak_CN[1], filtered = filtered, strict = F, get_homd = F, mode = "CNA", nomaf = nomaf, rerun = rerun, maxpeak = maxpeak, bw = bw)
predictedpositions <- output$predictedpositions
matchedPeaks = output$matchedPeaks
depthdiff <- output$depthdiff
if(!all(allPeaks$npeak %in% matchedPeaks$npeak)){
subclonalpositions <- allPeaks$pos[which(!allPeaks$npeak %in% matchedPeaks$npeak)]
names(subclonalpositions) <- paste0(round(predict(lm(CN ~ pos, data.frame("pos" = predictedpositions, "CN" = as.numeric(names(predictedpositions)))), data.frame("pos" = subclonalpositions)), digits = 1))
}
CN_calls <- ploidetect_segmentator(filtered, matchedPeaks, maxpeak, predictedpositions, highoutliers, depthdiff, avg_allele_freq = avg_allele_freq, window_size = window_size, window_id = window_id, tumour = tumour, segmentation_threshold = segmentation_threshold, verbose = verbose, GC = GC, all_data = all_data)
n50_segments <- function(lengths){
lengths=sort(lengths)
sumlengths=sum(as.numeric(lengths))
currentsum=0
index=1
while(currentsum < 0.5*sumlengths){
currentsum = currentsum + lengths[index]
index=index+1
}
return(currentsum/(index-1))
}
n50_lengths <- c()
n50_lengths <- c(n50_lengths, CN_calls %>% group_by(chr, segment) %>% dplyr::summarise("length" = sum(end - pos)) %>% ungroup %>% dplyr::summarise("n50" = n50_segments(length)) %>% unlist())
#CN_calls %>% filter(chr == 20) %>% ggplot(aes(x = pos, y = raw_residual, color = segment)) + geom_point() + scale_color_viridis()
new_CN_calls <- ploidetect_fineCNAs(all_data = all_data, CNAout = CN_calls, depthdiff = depthdiff, maxpeak = maxpeak, TC = TC_calls$tumour_purity[1], ploidy = TC_calls$Ploidy[1], verbose = verbose, tumour = tumour, normal = normal, avg_allele_freq = avg_allele_freq, window_id = window_id, window_size = window_size, GC = GC, decision = decision, simpsize = simplify_size, unaltered = unaltered)
new_CN_calls <- do.call(rbind.data.frame, new_CN_calls)
n50_lengths <- c(n50_lengths, new_CN_calls %>% group_by(chr, segment) %>% dplyr::summarise("length" = sum(end - pos)) %>% ungroup %>% dplyr::summarise("n50" = n50_segments(length)) %>% unlist())
iters <- 2
target_iters <- ceiling(log2(simplify_size/median(all_data$end - all_data$pos)))
#new_CN_calls %>% ungroup %>% filter(chr == "17", CN < 10) %>% ggplot(aes(x = pos, y = raw_residual, color = CN)) + geom_point() + scale_color_viridis()
while(iters < (target_iters)){
initial_points <- nrow(new_CN_calls)
newer_CN_calls <- ploidetect_fineCNAs(all_data = all_data, CNAout = new_CN_calls, depthdiff = depthdiff, maxpeak = maxpeak, TC = TC_calls$tumour_purity[1], ploidy = TC_calls$Ploidy[1], verbose = verbose, tumour = tumour, normal = normal, avg_allele_freq = avg_allele_freq, window_id = window_id, window_size = window_size, GC = GC, decision = decision, simpsize = simplify_size/(2^(iters-1)), unaltered = unaltered)
newer_CN_calls <- do.call(rbind.data.frame, newer_CN_calls)
n50_lengths <- c(n50_lengths, newer_CN_calls %>% group_by(chr, segment) %>% dplyr::summarise("length" = sum(end - pos)) %>% ungroup %>% dplyr::summarise("n50" = n50_segments(length)) %>% unlist())
iters <- iters+1
if(nrow(new_CN_calls) == initial_points){
iters <- Inf
}
if(n50_lengths[length(n50_lengths)] < n50_lengths[length(n50_lengths) - 1]/10){
if(verbose){
print(paste0("Exited segmentation early with mean bin size ", mean(newer_CN_calls$end - newer_CN_calls$pos)))
}
iters <- Inf
}else{
new_CN_calls <- newer_CN_calls
}
}
print(n50_lengths)
#new_CN_calls %>% ungroup %>% filter(chr == 11, CN < 10) %>% ggplot(aes(x = pos, y = raw_residual, color = CN)) + geom_point() + scale_color_viridis()
if(verbose){
print("Calling LOH")
}
#CN_calls$subclonal <- F
#if(exists("subclonalpositions")){
# new_CN_calls$subclonal <- new_CN_calls$CN %in% as.numeric(names(subclonalpositions))
#}
new_CN_calls$CN <- round(new_CN_calls$CN, digits = 0)
CN_calls <- ploidetect_loh(purity = TC_calls$tumour_purity[1], CNA_object = new_CN_calls)
if(verbose){
print("Finished calling LOH")
}
CN_calls$state <- factor(CN_calls$state)
CN_palette <- c("0" = "#cc0000",
"1" = "#000066",
"2" = "#26d953",
"3" = "#609f70",
"4" = "#cccc00",
"5" = "#80804d",
"6" = "#cc6600",
"7" = "#856647",
"8" = "#cc0000"
)
CN_calls <- split(CN_calls, f = CN_calls$chr)
CNA_plot <- lapply(CN_calls, function(x){
chr = x$chr[1]
x %>% filter(end < centromeres$pos[which(centromeres$chr %in% chr)[1]] | pos > centromeres$end[which(centromeres$chr %in% chr)[2]]) %>% ggplot(aes(x = pos, y = log(raw_residual + maxpeak), color = as.character(state))) +
geom_point(size = 0.5) +
scale_color_manual(name = "State",
values = CN_palette,
labels = c("0" = "HOMD",
"1" = "CN = 1",
"2" = "CN = 2 HET",
"3" = "CN = 2 HOM",
"4" = "CN = 3 HET",
"5" = "CN = 3 HOM",
"6" = "CN = 4 HET",
"7" = "CN = 4 HOM",
"8" = "CN = 5+")) +
ylab("log(Read Depth)") +
xlab("position") +
ggtitle(paste0("Chromosome ", chr, " copy number profile")) +
theme_bw()
})
vaf_plot <- lapply(CN_calls, function(x){
chr = x$chr[1]
x %>% filter(end < centromeres$pos[which(centromeres$chr %in% chr)][1] | pos > centromeres$end[which(centromeres$chr %in% chr)][2]) %>% filter(!is.na(mafflipped)) %>% ggplot(aes(x = pos, y = mafflipped, color = as.character(state))) +
geom_point(size = 0.5) +
scale_color_manual(name = "State",
values = CN_palette,
labels = c("0" = "HOMD",
"1" = "CN = 1",
"2" = "CN = 2 HET",
"3" = "CN = 2 HOM",
"4" = "CN = 3 HET",
"5" = "CN = 3 HOM",
"6" = "CN = 4 HET",
"7" = "CN = 4 HOM",
"8" = "CN = 5+")) +
ylab("Major allele frequency") +
xlab("position") +
ggtitle(paste0("Chromosome ", chr, " allele frequency profile")) +
scale_y_continuous(limits = c(0.5, 1)) +
theme_bw()
})
cna_plots <- list()
for(i in 1:length(CNA_plot)){
cna_plots[i] <- list(plot_grid(CNA_plot[[i]], vaf_plot[[i]], align = "v", axis = "l", ncol = 1))
}
CN_calls <- do.call(rbind.data.frame, CN_calls)
plots <- c(plots, cna_plots)
return(list("TC_calls" = TC_calls, "plots" = plots, "CN_calls" = CN_calls))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.