#' bedpeToRearrCatalogue
#'
#' This function converts a data frame BEDPE into a rearrangement catalogue,
#' you should pass rearrangements of only one sample, and one rearrangement for each paired-end mates.
#' The BEDPE data fram should contain the following columns: "chrom1", "start1", "end1", "chrom2", "start2", "end2" and "sample" (sample name).
#' In addition, either two columns indicating the strands of the mates, "strand1" (+ or -) and "strand2" (+ or -), or one column indicating the structural variant class, "svclass": translocation, inversion, deletion, tandem-duplication.
#'
#' The column "svclass" should correspon to (Sanger BRASS convention): (strand1/strand2)
#'
#' inversion (+/-), if mates on the same chromosome
#'
#' inversion (-/+), if mates on the same chromosome
#'
#' deletion (+/+), if mates on the same chromosome
#'
#' tandem-duplication (-/-), if mates on the same chromosome
#'
#' translocation, if mates are on different chromosomes
#'
#' @param sv_bedpe data frame BEDPE as described above
#' @return returns a list with 1) the rearrangement catalogue for the given sample and 2) the annotated bedpe for the given sample
#' @keywords bedpe rearrangement
#' @export
#' @examples
#' vcf_sv_file.bedpe <- "sample.bedpe"
#' sv_bedpe <- read.table(vcf_sv_file.bedpe,sep = "\t",header = TRUE,
#' stringsAsFactors = FALSE,check.names = FALSE)
#' #build a catalogue from the bedpe file
#' res.cat <- bedpeToRearrCatalogue(sv_bedpe)
bedpeToRearrCatalogue <- function(sv_bedpe){
#check that the required columns are present
required_cols <- c("chrom1", "start1", "end1", "chrom2", "start2", "end2" , "sample")
if(!length(intersect(required_cols,colnames(sv_bedpe)))==length(required_cols) & ("svclass" %in% colnames(sv_bedpe) | all(c("strand1","strand2") %in% colnames(sv_bedpe)))){
stop("[error bedpeToRearrCatalogue] missing columns in subs data frame, following columns required: chrom1, start1, end1, chrom2, start2, end2 and sample. In addition either svclass or strand1 and strand2. Check ?bedpeToRearrCatalogue for details.")
}
#Annotate the bedpe if necessary
if(nrow(sv_bedpe)>0){
#check whether column is.clustered is present,
#if not, compute it
if (! "is.clustered" %in% colnames(sv_bedpe)){
clustering.result <- rearrangement.clustering_bedpe(sv_bedpe,
plot.path = NA,
kmin=10,
kmin.samples=1,
gamma.sdev=25,
PEAK.FACTOR=10,
thresh.dist=NA)
sv_bedpe <- clustering.result$sv_bedpe
}
#check whether column svclass is present,
#if not, compute it
if (! "svclass" %in% colnames(sv_bedpe)){
if ("strand1" %in% colnames(sv_bedpe) & "strand2" %in% colnames(sv_bedpe)){
sv_bedpe <- classifyRearrangementsFromBedpe(sv_bedpe)
}else{
message("cannot classify rearrangements: svclass column missing, and cannot compute it because strand1 and strand2 are missing.")
}
}
#now compute the catalogue
rearr_catalogue <- prepare.rearr.catalogue_fromAnnotatedBedpe(sv_bedpe)
}else{
#case in which the sv_bedpe table has 0 rows
rearr_catalogue <- prepare.rearr.catalogue_fromAnnotatedBedpe(sv_bedpe)
}
return_list <- list()
return_list$rearr_catalogue <- rearr_catalogue
return_list$annotated_bedpe <- sv_bedpe
return(return_list)
}
# source('../lib/pcf/fastPCF.R')
# source('../lib/pcf/extract.kat.regions.R')
# source('../lib/utils/calcIntermutDist.R')
# this is used for per-sample clustering of both single-base substitutions and rearrangement breakpoints
rearrangement.clustering_bedpe <- function(sv_bedpe,
plot.path=NA,
kmin=10,# how many points at minimum in a peak, for the pcf algorithm
kmin.samples=kmin, # how many different samples at minimum in a peak
gamma.sdev=25, #
PEAK.FACTOR=4,
thresh.dist=NA,
gamma=NA,
kmin.filter=kmin # if the pcf parameter is different from the definition of a peak
) {
#add an id to the rearrangement
sv_bedpe$id <- 1:nrow(sv_bedpe)
#functions below expect rows to be organised by chromosomes and ordered by position on the chromosome
#prepare a dataframe for the calculation
rearrs.left <- sv_bedpe[,c('chrom1','start1','sample')]
names(rearrs.left ) <- NA
rearrs.right <- sv_bedpe[,c('chrom2','start2','sample')]
names(rearrs.right ) <- NA
rearrs.cncd <- rbind(rearrs.left , rearrs.right )
colnames(rearrs.cncd) <- c('chr', 'position', 'sample')
rearrs.cncd$isLeft <- c(rep(TRUE, nrow(rearrs.left)), rep(FALSE, nrow(rearrs.left)))
rearrs.cncd$id <- c(sv_bedpe$id, sv_bedpe$id)
# sample.bps <- rearrs.cncd
#need to reorder
sample.bps <- NULL
for (chrom_i in unique(rearrs.cncd$chr)){
tmptab <- rearrs.cncd[rearrs.cncd$chr==chrom_i,,drop=FALSE]
tmptab <- tmptab[order(tmptab$position),,drop=FALSE]
sample.bps <- rbind(sample.bps,tmptab)
}
rownames(sample.bps) <- 1:nrow(sample.bps)
#run the algorithm
genome.size <- 3 * 10^9
MIN.BPS <- 10 # minimal number of breakpoints on a chromosome to do any any segmentation
logScale <- FALSE
exp.dist <-genome.size/nrow(sample.bps)
if (logScale) {
sample.bps$intermut.dist <- log10(calcIntermutDist(sample.bps, first.chrom.na=FALSE)$distPrev) # calculate the distances between the breakpoints
if (is.na(thresh.dist)) {
thresh.dist <- log10(exp.dist/PEAK.FACTOR) # calculate the threshold to call a peak
}
} else {
sample.bps$intermut.dist <- calcIntermutDist(sample.bps, first.chrom.na=FALSE)$distPrev
if (is.na(thresh.dist)) {
thresh.dist <- exp.dist/PEAK.FACTOR
}
}
if (is.na(gamma) & !is.na(gamma.sdev)) {
# compute the mean absolute deviation
sdev <- getMad(sample.bps$intermut.dist);
gamma <- gamma.sdev*sdev
}
sample.bps$is.clustered.single <- rep(FALSE, nrow(sample.bps))
all.kat.regions <- data.frame()
for (chrom in unique(sample.bps$chr)) { # loop over chromosomes
sample.bps.flag <- sample.bps$chr==chrom # breakpoints on a current chromosome
# sample.bps.chrom <- sample.bps[sample.bps.flag,]
# sample.bps.chrom <- sample.bps.chrom[order(sample.bps.chrom$position),]
#
if (sum(sample.bps.flag )>MIN.BPS ) { # if there are enough breakpoints on a chromosome to run pcf
data.points <- sample.bps$intermut.dist[sample.bps.flag]
# data.points <- sample.bps.chrom$intermut.dist
res = exactPcf(data.points, kmin, gamma, T)
#reorder results
sample.bps$mean.intermut.dist[sample.bps.flag] <- res$yhat
# prepare the points for pcf
subs <- data.frame(chr=sample.bps$chr[sample.bps.flag], pos=sample.bps$position[sample.bps.flag], sample=sample.bps$sample[sample.bps.flag])
kat.regions <- extract.kat.regions(res, thresh.dist, subs, doMerging=TRUE, kmin.samples=1, kmin.filter= kmin.filter) # extract peaks, this is special case as we want at least kmin samples
all.kat.regions <- rbind(all.kat.regions, kat.regions)
if (!is.null(kat.regions) && nrow( kat.regions )>0) { # if there are any kataegis regions found on this chormosome
for (k in 1:nrow(kat.regions)) {
sample.bps$is.clustered.single[which(sample.bps.flag)[ kat.regions$firstBp[k] : kat.regions$lastBp[k]]] <- TRUE # call all breakpoints as clustered
}
}
} else {
sample.bps$mean.intermut.dist[sample.bps.flag] <- mean(sample.bps$intermut.dist[sample.bps.flag])
}
}
if (!logScale) { # even if pcf was run on non-logged distances, I log the output
sample.bps$intermut.dist <- log10(sample.bps$intermut.dist)
sample.bps$mean.intermut.dist <- log10(sample.bps$mean.intermut.dist)
}
# a rearrangement is in a cluster if any of its breakpoints are
sample.bps$is.clustered <- sample.bps$is.clustered.single
sample.bps$is.clustered[sample.bps$id %in% subset(sample.bps, is.clustered.single==TRUE)$id] <- TRUE
# mark both breakpoints of a rearrangement as clustered if any is
sv_bedpe$is.clustered <- sv_bedpe$id %in% sample.bps$id[sample.bps$is.clustered]
result <- list()
result$sv_bedpe <- sv_bedpe
result$kat.regions <- all.kat.regions
result
}
prepare.rearr.catalogue_fromAnnotatedBedpe <- function(sv_bedpe) {
catalogue.labels <- c('clustered_del_1-10Kb', 'clustered_del_10-100Kb', 'clustered_del_100Kb-1Mb', 'clustered_del_1Mb-10Mb', 'clustered_del_>10Mb', 'clustered_tds_1-10Kb', 'clustered_tds_10-100Kb', 'clustered_tds_100Kb-1Mb', 'clustered_tds_1Mb-10Mb', 'clustered_tds_>10Mb', 'clustered_inv_1-10Kb', 'clustered_inv_10-100Kb', 'clustered_inv_100Kb-1Mb', 'clustered_inv_1Mb-10Mb', 'clustered_inv_>10Mb', 'clustered_trans', 'non-clustered_del_1-10Kb', 'non-clustered_del_10-100Kb', 'non-clustered_del_100Kb-1Mb', 'non-clustered_del_1Mb-10Mb', 'non-clustered_del_>10Mb', 'non-clustered_tds_1-10Kb', 'non-clustered_tds_10-100Kb', 'non-clustered_tds_100Kb-1Mb', 'non-clustered_tds_1Mb-10Mb', 'non-clustered_tds_>10Mb', 'non-clustered_inv_1-10Kb', 'non-clustered_inv_10-100Kb', 'non-clustered_inv_100Kb-1Mb', 'non-clustered_inv_1Mb-10Mb', 'non-clustered_inv_>10Mb', 'non-clustered_trans')
all_catalogues <- as.data.frame(matrix(nrow = length(catalogue.labels),ncol = 0))
rownames(all_catalogues) <- catalogue.labels
if (nrow(sv_bedpe)>0){
for (sample_name in unique(sv_bedpe$sample)){
sample.rearrs <- sv_bedpe[sv_bedpe$sample==sample_name,]
rearr_catalogue <- as.data.frame(matrix(0,nrow = length(catalogue.labels),ncol = 1))
if (nrow(sample.rearrs)>0) {
label1 <- rep('non-clustered', nrow(sample.rearrs))
label1[ sample.rearrs$is.clustered] <- 'clustered'
label2 <- rep('', nrow(sample.rearrs))
label2[ sample.rearrs$svclass=='deletion'] <- '_del'
label2[ sample.rearrs$svclass=='translocation'] <- '_trans'
label2[ sample.rearrs$svclass=='inversion'] <- '_inv'
label2[ sample.rearrs$svclass=='tandem-duplication'] <- '_tds'
label3 <- rep('', nrow(sample.rearrs))
sample.rearrs$bkdist <- abs(sample.rearrs$start2 - sample.rearrs$start1)
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist<=1e4] <- '_1-10Kb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e4 & sample.rearrs$bkdist<=1e5 ] <- '_10-100Kb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e5 & sample.rearrs$bkdist<=1e6 ] <- '_100Kb-1Mb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e6 & sample.rearrs$bkdist<=1e7 ] <- '_1Mb-10Mb'
label3[ sample.rearrs$svclass!='translocation' & sample.rearrs$bkdist>1e7 ] <- '_>10Mb'
sample.rearrs$catalogue.label <- paste0(label1, label2, label3)
sample.table <- as.data.frame(table( sample.rearrs$catalogue.label),drop=FALSE)
rownames(sample.table ) <- sample.table$Var
rearr_catalogue <- sample.table [as.character(catalogue.labels), 'Freq',drop=FALSE ]
}
rearr.catalogue <- rearr_catalogue
rownames(rearr.catalogue) <- catalogue.labels
colnames(rearr.catalogue) <- sample_name
rearr.catalogue[is.na(rearr.catalogue)] <- 0
all_catalogues <- cbind(all_catalogues,rearr.catalogue)
}
}else{
all_catalogues <- as.data.frame(matrix(0,nrow = length(catalogue.labels),ncol = 1))
}
all_catalogues
}
classifyRearrangementsFromBedpe <- function(sv_bedpe){
svclass <- c()
for (i in 1:nrow(sv_bedpe)){
if(sv_bedpe[i,"chrom1"]!=sv_bedpe[i,"chrom2"]){
svclass <- c(svclass,"translocation")
}else if(sv_bedpe[i,"strand1"]!=sv_bedpe[i,"strand2"]){
svclass <- c(svclass,"inversion")
}else if(sv_bedpe[i,"strand1"]=="+"){
svclass <- c(svclass,"deletion")
}else if(sv_bedpe[i,"strand1"]=="-"){
svclass <- c(svclass,"tandem-duplication")
}
}
sv_bedpe[,"svclass"] <- svclass
#return updated df
sv_bedpe
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.