#' bedpeToRearrCatalogue
#'
#' This function converts a data frame BEDPE into a rearrangement catalogue,
#' you should pass rearrangements of only one sample, and one rearrangement for each paired-end mates.
#' The BEDPE data fram should contain the following columns: "chrom1", "start1", "end1", "chrom2", "start2", "end2" and "sample" (sample name).
#' In addition, either two columns indicating the strands of the mates, "strand1" (+ or -) and "strand2" (+ or -), or one column indicating the structural variant class, "svclass": translocation, inversion, deletion, tandem-duplication.
#' If you specify the "svclass" column, then the "strand1" and "strand2" columns will be ignored. If the "svclass" column is absent, then it will be created
#' using the convention of BrassII from the Sanger Institute pipeline: inversion when strand1 and strand2 are different, deletion when strand1 and strand2 are both +,
#' tandem-duplication when strand1 and strand2 are both -, and translocation when strand1 and strand2 are on different chromosomes.
#'
#' Please notice that the interpretation of strand1 and strand2 from your rearrangement caller may differ from the BrassII Sanger interpretation.
#' Typically, other callers may have the strand2 sign inverted with respect to what is shown by BrassII, so for example a deletion is when strand1 is + and strand2 is -,
#' instead of when both are + as in our case. To avoid confusion, double check the convention of your caller, and possibly specify the svclass column yourself to simply ignore
#' the strand1 and strand2 automated interpretation.
#'
#' Optionally, the user can provide in the bedpe two additional columns, "non-template" and "micro-homology",
#' which should contain the DNA sequence inserted ("non-template") or deleted ("micro-homology") at the breakpoints junction.
#' A dot (".") should be inserted in these columns if a DNA sequence is not available. When these two columns are available,
#' a junctions catalogue will be computed and returned. A junction catalogue contains the counts of how many clustered/unclustered
#' rearrangements have non-templated insertions or micro-homology deletions of a certain size.
#'
#' @param sv_bedpe data frame BEDPE as described above
#' @param kmin minimum number of break points in a segment to consider it a cluster. Default is 10.
#' @param PEAK.FACTOR this factor is used to calculate a threshold for the minimum average distance of breakpoints in a cluster. The threshold is given by the expected distance divided by the PEAK.FACTOR. In turn, the expected distance is the number of base pairs in a genome divided by the total number of break points. Default is 10.
#' @return returns a list with the rearrangement catalogue (rearr_catalogue) for the given sample and the annotated bedpe (annotated_bedpe) for the given sample. Also, a junctions catalogue will be returned if the non-template and micro-homology columns are provided. If clusters of rearrangements are found then the clustering regions will also be returned (clustering_regions).
#' @keywords bedpe rearrangement
#' @export
#' @examples
#' vcf_sv_file.bedpe <- "sample.bedpe"
#' sv_bedpe <- read.table(vcf_sv_file.bedpe,sep = "\t",header = TRUE,
#' stringsAsFactors = FALSE,check.names = FALSE)
#' #build a catalogue from the bedpe file
#' res <- bedpeToRearrCatalogue(sv_bedpe)
#' plotRearrSignatures(res$rearr_catalogue)
bedpeToRearrCatalogue <- function(sv_bedpe,
kmin = 10,
PEAK.FACTOR = 10){
#check that the required columns are present
required_cols <- c("chrom1", "start1", "end1", "chrom2", "start2", "end2" , "sample")
if(!length(intersect(required_cols,colnames(sv_bedpe)))==length(required_cols) & ("svclass" %in% colnames(sv_bedpe) | all(c("strand1","strand2") %in% colnames(sv_bedpe)))){
stop("[error bedpeToRearrCatalogue] missing columns in subs data frame, following columns required: chrom1, start1, end1, chrom2, start2, end2 and sample. In addition either svclass or strand1 and strand2. Check ?bedpeToRearrCatalogue for details.")
}
clusters_table <- NULL
if(nrow(sv_bedpe)>0){
# make sure that there are no multiple samples here
if(length(unique(sv_bedpe$sample))>1){
# keep only the most frequent sample
tmpsamplenames <- unique(sv_bedpe$sample)
tmpcounts <- table(sv_bedpe$sample)
pos <- which.max(tmpcounts)
samplechoice <- names(pos)
sv_bedpe <- sv_bedpe[sv_bedpe$sample==samplechoice,,drop=F]
message("[warning bedpeToRearrCatalogue] bedpeToRearrCatalogue sample column should have only one sample name, however multiple sample names were detected: ",
paste(tmpsamplenames,collapse = ", "),". Trying to fix this by using only the sample with the largest number of rearrangements (",samplechoice,"). ",
"This fix should work if there are a few rearrangements from the germline sample, so all we would do is to remove the germline sample name and variants ",
"while keeping all the tumour sample variants.")
}
}
#Annotate the bedpe if necessary
if(nrow(sv_bedpe)>0){
#check whether column svclass is present,
#if not, compute it
if (! "svclass" %in% colnames(sv_bedpe)){
if ("strand1" %in% colnames(sv_bedpe) & "strand2" %in% colnames(sv_bedpe)){
sv_bedpe <- classifyRearrangementsFromBedpe(sv_bedpe)
}else{
message("[error bedpeToRearrCatalogue] cannot classify rearrangements: svclass column missing, and cannot compute it because strand1 and strand2 are missing.")
return(NULL)
}
}
# removing SVs shorter than 1kb
all_sv_annotated <- sv_bedpe
bkdist <- abs(sv_bedpe$start2 - sv_bedpe$start1)
sv_bedpe[sv_bedpe$svclass!='translocation',"length"] <- bkdist[sv_bedpe$svclass!='translocation']
toberemoved <- sv_bedpe$svclass!='translocation' & bkdist<1e3
if(sum(toberemoved)>0){
message("[warning bedpeToRearrCatalogue] ignoring rearrangements shorter than 1kb (",sum(toberemoved)," out of ",nrow(all_sv_annotated),")")
all_sv_annotated[toberemoved,"FILTER"] <- "length<1e3"
all_sv_annotated[!toberemoved,"FILTER"] <- "PASS"
sv_bedpe <- sv_bedpe[!toberemoved,,drop=F]
}
if(nrow(sv_bedpe)>0){
# now cluster
clustering.result <- rearrangement.clustering_bedpe(sv_bedpe,
plot.path = NA,
kmin=kmin,
kmin.samples=1,
gamma.sdev=25,
PEAK.FACTOR=PEAK.FACTOR,
thresh.dist=NA)
sv_bedpe <- clustering.result$sv_bedpe
clustering_regions <- clustering.result$clustering_regions
}else{
clustering_regions <- NULL
}
}else{
all_sv_annotated <- NULL
clustering_regions <- NULL
}
#now compute the catalogue (this takes care of the 0 SVs case)
resSVcat <- prepare.rearr.catalogue_fromAnnotatedBedpe(sv_bedpe)
rearr_catalogue <- resSVcat$SV_catalogue
sv_bedpe <- resSVcat$updated_sv_bedpe
# get the junctions catalogue too (this takes care of the 0 SVs case)
junctions_catalogue <- build_junctions_catalogue(sv_bedpe)
if(!is.null(junctions_catalogue)) colnames(junctions_catalogue) <- colnames(rearr_catalogue)
if(nrow(sv_bedpe)>0){
annotated_bedpe <- sv_bedpe
}else{
annotated_bedpe <- NULL
}
# return all results
return_list <- list()
return_list$rearr_catalogue <- rearr_catalogue
return_list$junctions_catalogue <- junctions_catalogue
return_list$annotated_bedpe <- annotated_bedpe
if(all(c("non-template","micro-homology") %in% colnames(annotated_bedpe))) return_list$nImprecise <- sum(annotated_bedpe[,"non-template"]=="_" & annotated_bedpe[,"micro-homology"]=="_")
return_list$all_sv_annotated <- all_sv_annotated
if(!is.null(clustering_regions)){
if(nrow(clustering_regions)>0) return_list$clustering_regions <- clustering_regions
}
return(return_list)
}
# this is used for per-sample clustering of both single-base substitutions and rearrangement breakpoints
rearrangement.clustering_bedpe <- function(sv_bedpe,
plot.path=NA,
kmin=10,# how many points at minimum in a peak, for the pcf algorithm
kmin.samples=kmin, # how many different samples at minimum in a peak
gamma.sdev=25, #
PEAK.FACTOR=4,
thresh.dist=NA,
gamma=NA,
kmin.filter=kmin # if the pcf parameter is different from the definition of a peak
) {
#add an id to the rearrangement
sv_bedpe$id <- 1:nrow(sv_bedpe)
#functions below expect rows to be organised by chromosomes and ordered by position on the chromosome
#prepare a dataframe for the calculation
rearrs.left <- sv_bedpe[,c('chrom1','start1','sample')]
names(rearrs.left ) <- NA
rearrs.right <- sv_bedpe[,c('chrom2','start2','sample')]
names(rearrs.right ) <- NA
rearrs.cncd <- rbind(rearrs.left , rearrs.right )
colnames(rearrs.cncd) <- c('chr', 'position', 'sample')
rearrs.cncd$isLeft <- c(rep(TRUE, nrow(rearrs.left)), rep(FALSE, nrow(rearrs.left)))
rearrs.cncd$id <- c(sv_bedpe$id, sv_bedpe$id)
# sample.bps <- rearrs.cncd
#need to reorder
sample.bps <- NULL
for (chrom_i in unique(rearrs.cncd$chr)){
tmptab <- rearrs.cncd[rearrs.cncd$chr==chrom_i,,drop=FALSE]
tmptab <- tmptab[order(tmptab$position),,drop=FALSE]
sample.bps <- rbind(sample.bps,tmptab)
}
rownames(sample.bps) <- 1:nrow(sample.bps)
#run the algorithm
genome.size <- 3 * 10^9
MIN.BPS <- 10 # minimal number of breakpoints on a chromosome to do any any segmentation
logScale <- FALSE
exp.dist <-genome.size/nrow(sample.bps)
if (logScale) {
sample.bps$intermut.dist <- log10(calcIntermutDist(sample.bps, first.chrom.na=FALSE)$distPrev) # calculate the distances between the breakpoints
if (is.na(thresh.dist)) {
thresh.dist <- log10(exp.dist/PEAK.FACTOR) # calculate the threshold to call a peak
}
} else {
sample.bps$intermut.dist <- calcIntermutDist(sample.bps, first.chrom.na=FALSE)$distPrev
if (is.na(thresh.dist)) {
thresh.dist <- exp.dist/PEAK.FACTOR
}
}
if (is.na(gamma) & !is.na(gamma.sdev)) {
# compute the mean absolute deviation
sdev <- getMad(sample.bps$intermut.dist);
gamma <- gamma.sdev*sdev
}
sample.bps$is.clustered.single <- rep(FALSE, nrow(sample.bps))
all.kat.regions <- data.frame()
for (chrom in unique(sample.bps$chr)) { # loop over chromosomes
sample.bps.flag <- sample.bps$chr==chrom # breakpoints on a current chromosome
#
if (sum(sample.bps.flag )>MIN.BPS ) { # if there are enough breakpoints on a chromosome to run pcf
data.points <- sample.bps$intermut.dist[sample.bps.flag]
res = exactPcf(data.points, kmin, gamma, T)
#reorder results
sample.bps$mean.intermut.dist[sample.bps.flag] <- res$yhat
# prepare the points for pcf
subs <- data.frame(chr=sample.bps$chr[sample.bps.flag], pos=sample.bps$position[sample.bps.flag], sample=sample.bps$sample[sample.bps.flag])
kat.regions <- extract.kat.regions(res, thresh.dist, subs, doMerging=TRUE, kmin.samples=1, kmin.filter= kmin.filter) # extract peaks, this is special case as we want at least kmin samples
all.kat.regions <- rbind(all.kat.regions, kat.regions)
if (!is.null(kat.regions) && nrow( kat.regions )>0) { # if there are any kataegis regions found on this chormosome
for (k in 1:nrow(kat.regions)) {
sample.bps$is.clustered.single[which(sample.bps.flag)[ kat.regions$firstBp[k] : kat.regions$lastBp[k]]] <- TRUE # call all breakpoints as clustered
}
}
} else {
sample.bps$mean.intermut.dist[sample.bps.flag] <- mean(sample.bps$intermut.dist[sample.bps.flag])
}
}
if (!logScale) { # even if pcf was run on non-logged distances, I log the output
sample.bps$intermut.dist <- log10(sample.bps$intermut.dist)
sample.bps$mean.intermut.dist <- log10(sample.bps$mean.intermut.dist)
}
# a rearrangement is in a cluster if any of its breakpoints are
sample.bps$is.clustered <- sample.bps$is.clustered.single
sample.bps$is.clustered[sample.bps$id %in% subset(sample.bps, is.clustered.single==TRUE)$id] <- TRUE
# mark both breakpoints of a rearrangement as clustered if any is
sv_bedpe$is.clustered <- sv_bedpe$id %in% sample.bps$id[sample.bps$is.clustered]
result <- list()
result$sv_bedpe <- sv_bedpe
result$clustering_regions <- all.kat.regions
result
}
prepare.rearr.catalogue_fromAnnotatedBedpe <- function(sv_bedpe) {
catalogue.labels <- c('clustered_del_1-10Kb', 'clustered_del_10-100Kb', 'clustered_del_100Kb-1Mb', 'clustered_del_1Mb-10Mb', 'clustered_del_>10Mb', 'clustered_tds_1-10Kb', 'clustered_tds_10-100Kb', 'clustered_tds_100Kb-1Mb', 'clustered_tds_1Mb-10Mb', 'clustered_tds_>10Mb', 'clustered_inv_1-10Kb', 'clustered_inv_10-100Kb', 'clustered_inv_100Kb-1Mb', 'clustered_inv_1Mb-10Mb', 'clustered_inv_>10Mb', 'clustered_trans', 'non-clustered_del_1-10Kb', 'non-clustered_del_10-100Kb', 'non-clustered_del_100Kb-1Mb', 'non-clustered_del_1Mb-10Mb', 'non-clustered_del_>10Mb', 'non-clustered_tds_1-10Kb', 'non-clustered_tds_10-100Kb', 'non-clustered_tds_100Kb-1Mb', 'non-clustered_tds_1Mb-10Mb', 'non-clustered_tds_>10Mb', 'non-clustered_inv_1-10Kb', 'non-clustered_inv_10-100Kb', 'non-clustered_inv_100Kb-1Mb', 'non-clustered_inv_1Mb-10Mb', 'non-clustered_inv_>10Mb', 'non-clustered_trans')
all_catalogues <- as.data.frame(matrix(nrow = length(catalogue.labels),ncol = 0))
rownames(all_catalogues) <- catalogue.labels
updated_sv_bedpe <- NULL
if (nrow(sv_bedpe)>0){
for (sample_name in unique(sv_bedpe$sample)){
sample.rearrs <- sv_bedpe[sv_bedpe$sample==sample_name,]
rearr_catalogue <- as.data.frame(matrix(0,nrow = length(catalogue.labels),ncol = 1))
if (nrow(sample.rearrs)>0) {
label1 <- rep('non-clustered', nrow(sample.rearrs))
label1[ sample.rearrs$is.clustered] <- 'clustered'
label2 <- rep('', nrow(sample.rearrs))
label2[ sample.rearrs$svclass=='deletion'] <- '_del'
label2[ sample.rearrs$svclass=='translocation'] <- '_trans'
label2[ sample.rearrs$svclass=='inversion'] <- '_inv'
label2[ sample.rearrs$svclass=='tandem-duplication'] <- '_tds'
label3 <- rep('', nrow(sample.rearrs))
sample.rearrs$bkdist <- abs(sample.rearrs$start2 - sample.rearrs$start1)
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist<=1e4] <- '_1-10Kb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e4 & sample.rearrs$bkdist<=1e5 ] <- '_10-100Kb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e5 & sample.rearrs$bkdist<=1e6 ] <- '_100Kb-1Mb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e6 & sample.rearrs$bkdist<=1e7 ] <- '_1Mb-10Mb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e7 ] <- '_>10Mb'
sample.rearrs$catalogue.label <- paste0(label1, label2, label3)
updated_sv_bedpe <- rbind(updated_sv_bedpe,sample.rearrs)
sample.table <- as.data.frame(table(sample.rearrs$catalogue.label),drop=FALSE)
rownames(sample.table) <- sample.table$Var
rearr_catalogue <- sample.table[as.character(catalogue.labels), 'Freq',drop=FALSE ]
}
rearr.catalogue <- rearr_catalogue
rownames(rearr.catalogue) <- catalogue.labels
colnames(rearr.catalogue) <- sample_name
rearr.catalogue[is.na(rearr.catalogue)] <- 0
all_catalogues <- cbind(all_catalogues,rearr.catalogue)
}
}else{
all_catalogues <- as.data.frame(matrix(0,nrow = length(catalogue.labels),ncol = 1))
rownames(all_catalogues) <- catalogue.labels
updated_sv_bedpe <- sv_bedpe
}
returnObj <- list()
returnObj$SV_catalogue <- all_catalogues
returnObj$updated_sv_bedpe <- updated_sv_bedpe
return(returnObj)
}
classifyRearrangementsFromBedpe <- function(sv_bedpe){
svclass <- c()
for (i in 1:nrow(sv_bedpe)){
if(sv_bedpe[i,"chrom1"]!=sv_bedpe[i,"chrom2"]){
svclass <- c(svclass,"translocation")
}else if(sv_bedpe[i,"strand1"]!=sv_bedpe[i,"strand2"]){
svclass <- c(svclass,"inversion")
}else if(sv_bedpe[i,"strand1"]=="+"){
svclass <- c(svclass,"deletion")
}else if(sv_bedpe[i,"strand1"]=="-"){
svclass <- c(svclass,"tandem-duplication")
}
}
sv_bedpe[,"svclass"] <- svclass
#return updated df
sv_bedpe
}
build_junctions_catalogue <- function(annotated_bedpe){
sizes <- c("_1-3",
"_4-10",
"_>10")
junctions_catalogue_channels <- paste(rep(c("clustered","non-clustered"),each=7),
rep(c(rep("_non-templated",3),"_other",rep("_homologous",3)),2),
rep(c(rev(sizes),"",sizes),2),sep = "")
junctions_catalogue <- data.frame(sample=rep(0,length(junctions_catalogue_channels)),
row.names = junctions_catalogue_channels,
stringsAsFactors = F)
if(nrow(annotated_bedpe)==0){
# cut short here returning a 0 catalogue if there are no SVs
return(junctions_catalogue)
}
# # check if non-template and micro-homology columns are present
if(all(c("non-template","micro-homology") %in% colnames(annotated_bedpe))){
for (clustered in c(TRUE,FALSE)){
# clustered <- F
channelc <- ifelse(clustered,"clustered","non-clustered")
for (typebp in c("_non-templated","_homologous","_other")){
# typebp <- "_non-templated"
channel <- paste0(channelc,typebp)
if(typebp %in% c("_non-templated","_homologous")){
bedpecol <- ifelse(typebp=="_non-templated","non-template","micro-homology")
currentseqlength <- nchar(annotated_bedpe[annotated_bedpe[,bedpecol]!="." & annotated_bedpe[,bedpecol]!="_" & annotated_bedpe$is.clustered==clustered,bedpecol])
if(length(currentseqlength)>0){
currentlengthtable <- table(currentseqlength)
for(i in 1:length(currentlengthtable)){
# i <- 1
tmpn <- as.numeric(names(currentlengthtable))[i]
if(tmpn>0 & tmpn<=3){
junctions_catalogue[paste0(channel,"_1-3"),"sample"] <- junctions_catalogue[paste0(channel,"_1-3"),"sample"] + currentlengthtable[i]
}else if(tmpn>3 & tmpn<=10){
junctions_catalogue[paste0(channel,"_4-10"),"sample"] <- junctions_catalogue[paste0(channel,"_4-10"),"sample"] + currentlengthtable[i]
}else if(tmpn>10){
junctions_catalogue[paste0(channel,"_>10"),"sample"] <- junctions_catalogue[paste0(channel,"_>10"),"sample"] + currentlengthtable[i]
}
}
}
}else{
tmpn <- sum((annotated_bedpe[,"non-template"]=="." & annotated_bedpe[,"micro-homology"]==".") & annotated_bedpe$is.clustered==clustered)
junctions_catalogue[channel,"sample"] <- junctions_catalogue[channel,"sample"] + tmpn
}
}
}
return(junctions_catalogue)
}else{
# return NULL if there are SVs but no "non-template" and "micro-homology" columns
return(NULL)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.