#=====================================================================
#===== Final analysis: frame status, filters, exon-junction, output
#=====================================================================
SplitFusion.breakpoint.anno.postscript.direction.sub = function(lr3){
n.lr3 = nrow(lr3)
sampleID = sub(".*\\/", "", getwd())
#if (!exists('StrVarMinStartSite')){StrVarMinStartSite=2}
if (n.lr3 >0){
## add the 6 base back to cDNA position
lr3$cdna_L = lr3$cdna_L0 + 6
lr3$cdna_R = lr3$cdna_R0 - 6
## intra-gene
lr3$intragene = ifelse(lr3$gene_L == lr3$gene_R, 1, 0)
lr3$exon = substr(lr3$exon_L,1,4)
lr3$exonD = 0
lr3$exonD = ifelse(lr3$gene_L == lr3$gene_R, lr3$exonn_R - lr3$exonn_L, lr3$exonD)
lr3$gec_L = paste0(lr3$gene_L, ' ', lr3$exon_L, ' c.', lr3$cdna_L, ' (', lr3$nm_L, ')')
lr3$gec_R = paste0(lr3$gene_R, ' ', lr3$exon_R, ' c.', lr3$cdna_R, ' (', lr3$nm_R, ')')
lr3$gec_LR = paste(lr3$gec_L, lr3$gec_R, sep='---')
lr3$gg = paste0(lr3$gene_L, '_', lr3$gene_R)
lr3$ge1 = paste0(lr3$gene_L, '_', lr3$exon_L)
lr3$ge2 = paste0(lr3$gene_R, '_', lr3$exon_R)
lr3$ge1ge2 = paste0(lr3$ge1,'---', lr3$ge2)
##======================
## frame status
##======================
lr3$frameD = (lr3$cdna_R0 %% 3 - (lr3$cdna_L0 - lr3$overlap) %% 3) %% 3
lr3$frame = ifelse(lr3$frameD ==1, 'in-frame', 'out-frame')
lr3$frame[is.na(lr3$frame)] = 'N.A.'
#head(lr3)
## expect mostly in-frame or _NA_. Otherwise, might need inspection
#table(lr3$frame,useNA='always')
lr3$neighb = ifelse(lr3$exonD==1, 1, 0)
lr3$neighb[is.na(lr3$neighb)] = 0
## order
lr3 = lr3[order(lr3$intragene, lr3$frame, lr3$gec_LR),]
##===========================================================
##=== Optional: Backend database supported target output or filtering
##===========================================================
##==== Targeted output known significant fusions or splicing isoforms (e.g. MET ex14 skipping)
known.gg=NA; known.ge=NA; known.partner=NA; known.3UTR=NA;
known.gg.file = paste0(panel_dir, '/known.gene-gene.txt'); if (file.exists(known.gg.file)){known.gg = readLines(known.gg.file)}
known.ge.file = paste0(panel_dir, '/known.gene-exon.txt'); if (file.exists(known.ge.file)){known.ge = readLines(known.ge.file)}
known.partner.file = paste0(panel_dir, '/known.partners.txt'); if (file.exists(known.partner.file)){known.partner = readLines(known.partner.file)}
known.3UTR.truncation.file = paste0(panel_dir, '/known.3UTR.truncation.txt'); if (file.exists(known.3UTR.truncation.file)){known.3UTR = readLines(known.3UTR.truncation.file)}
##==== Targeted remove unwanted fusions (e.g. involving homologous genes or recurrent falsed positives)
filter.gg=NA; filter.ge=NA;
filter.gg.file = paste0(panel_dir, '/filter.gene-gene.txt'); if (file.exists(filter.gg.file)){filter.gg0 = read.table(filter.gg.file, header=F, stringsAsFactors=F);
filter.gg = c(paste(filter.gg0$V1, filter.gg0$V2, sep='_'), paste(filter.gg0$V2, filter.gg0$V1, sep='_'))}
filter.ge.file = paste0(panel_dir, '/filter.gene-exon.txt'); if (file.exists(filter.ge.file)){filter.ge = readLines(filter.ge.file)}
lr3$known=0
lr3$known[(lr3$gene_T == lr3$gene_L & lr3$gene_R %in% known.partner & lr3$intragene==0)] = 1
lr3$known[(lr3$gene_T == lr3$gene_R & lr3$gene_L %in% known.partner & lr3$intragene==0)] = 1
lr3$known[(lr3$ge1ge2 %in% known.ge)] = 1
lr3$known[(lr3$ge1 %in% known.3UTR & lr3$intragene==0)] = 1
lr32 = subset(lr3, (known==1 | (num_partner_ends >= as.numeric(FusionMinStartSite) & intragene==0)) & !(gg %in% filter.gg | ge1 %in% filter.ge | ge2 %in% filter.ge))
# nrow(lr3); nrow(lr32)
# output for furture research
write.table(lr32, paste(sampleID, '.fusion.list.pre-processing.txt', sep=''), row.names=F, quote=F, sep='\t')
##======================
## Further filters
##======================
## To remove:
# - read-through (defined as within 100K distance on chrosome)
# - homologous genes
# - neighbour exons of same gene
# - out-frame (keep out-frame of different genes. FGFRs fusion can be functional even out-frame)
# - same exon
#
## To calculate:
# - exon boundary/junction
if (nrow(lr32)>0){
lr32$read.through = 0
lr32$read.through[lr32$gene_L != lr32$gene_R
& lr32$chr_L == lr32$chr_R
& abs(lr32$pos_L - lr32$pos_R)<100000
]=TRUE
lr32$gene_Lchr = gsub('[0-9]', '', lr32$gene_L)
lr32$gene_Rchr = gsub('[0-9]', '', lr32$gene_R)
homolog.match = function(row){
h1 = pmatch(row[1], row[2])
h2 = pmatch(row[2], row[1])
h3 = (h1 || h2)
return(h3)
}
lr32$homolog = apply(lr32[,c('gene_Lchr', 'gene_Rchr')], 1, homolog.match)
lr32$homolog[is.na(lr32$homolog)] = 0
lr32$homolog[lr32$known==1] = 0
lr40 = subset(lr32, !(read.through | homolog | neighb==1 | (intragene==1 & exon_L==exon_R)
| (intragene==1 & frame=='out-frame')
))
lr4 = lr40[!duplicated(lr40$readID),]
n.lr40 = nrow(lr40)
n.lr4 = nrow(lr4)
if (n.lr4>0){
lr4$SampleID = sampleID
# remove illegal symbol
lr4$gec_LR = gsub('\\(', '.', lr4$gec_LR)
lr4$gec_LR = gsub('\\)', '.', lr4$gec_LR)
lr4$ge1ge2 = gsub('\\(', '.', lr4$ge1ge2)
lr4$ge1ge2 = gsub('\\)', '.', lr4$ge1ge2)
##======================
## exon junction support based on RefGene exon starts/ends
##======================
bk = lr4[, c('breakpoint', 'gene_L', 'pos_L', 'gene_R', 'pos_R')]
# most representative pos
bk2 = ddply(bk, .(breakpoint, gene_L, gene_R), summarize
, 'pos_L' = names(sort(table(pos_L), decreasing = TRUE)[1])
, 'pos_R' = names(sort(table(pos_R), decreasing = TRUE)[1]))
bk.genes = unique(c(bk2$gene_L, bk2$gene_R))
refGene = read.table(paste0(path.package("SplitFusion"), '/data/refGene0.txt'), header=T, sep='\t')
refGene2 = subset(refGene, name2 %in% bk.genes)[,c('name2', 'exonStarts', 'exonEnds')]
toStarts = function(row){
dist = as.numeric(unlist(strsplit(as.character(row[1]),','))) - as.numeric(row[2])
if (any(dist %in% 0:2)) {junction=1} else {junction=0}
return(junction)
}
fromEnds = function(row){
dist = as.numeric(unlist(strsplit(as.character(row[1]),','))) - as.numeric(row[2])
if (any(dist %in% -2:0)) {junction=1} else {junction=0}
return(junction)
}
junc.L = unique(merge(bk2[, c('breakpoint', 'gene_L', 'pos_L')], refGene2, by.x='gene_L', by.y='name2', all.x=T))
junc.L$distLstart = apply(junc.L[,c('exonStarts', 'pos_L')], 1, toStarts)
junc.L$distLend = apply(junc.L[,c('exonEnds', 'pos_L')], 1, fromEnds)
junc.R = unique(merge(bk2[, c('breakpoint', 'gene_R', 'pos_R')], refGene2, by.x='gene_R', by.y='name2', all.x=T))
junc.R$distRstart = apply(junc.R[,c('exonStarts', 'pos_R')], 1, toStarts)
junc.R$distRend = apply(junc.R[,c('exonEnds', 'pos_R')], 1, fromEnds)
junc.LR = merge(junc.L, junc.R, by='breakpoint')[,c('breakpoint', 'distLstart', 'distLend', 'distRstart', 'distRend')]
junc.LR$exon.junction = junc.LR$distLstart + junc.LR$distLend + junc.LR$distRstart + junc.LR$distRend
#lr5 = merge(lr4, unique(junc.LR[,c('breakpoint', 'exon.junction')]), by='breakpoint')
lr5 = merge(lr4, unique(junc.LR[,c('breakpoint', 'exon.junction')]), by='breakpoint', allow.cartesian=TRUE)
## output
lr5 = lr5[order(-lr5$exon.junction, -lr5$known, -lr5$num_partner_ends, lr5$intragene, lr5$frame),]
fusion.list = subset(lr5, select=c('SampleID', 'ge1ge2', 'frame','gec_LR','chr_L','pos_L','inEx_L','gene_L','nm_L','exon_L','cdna_L'
, 'overlap','chr_R','pos_R','inEx_R','gene_R','nm_R','exon_R','cdna_R','intragene','readID'
, 'breakpoint', 'num_partner_ends', 'known', 'exon.junction'))
write.table(fusion.list, paste(sampleID, '.fusion.list.post-processing.txt', sep=''), row.names=F, quote=F, sep='\t')
## consolidate breakpoint
fusion.list[is.na(fusion.list)]='-'
fusion.list$readID2 = sub(':umi:.*', '', fusion.list$readID)
fusion.tab1 = ddply(fusion.list, .(breakpoint, SampleID, ge1ge2, frame, nm_L, inEx_L, gene_L, cdna_L, nm_R, inEx_R, gene_R, cdna_R, intragene, overlap, num_partner_ends, known), summarize
, 'num_unique_reads'=length(unique(readID2)), 'exon.junction' = max(exon.junction)
)
fusion.tab1$transcript_L = fusion.tab1$nm_L
fusion.tab1$transcript_R = fusion.tab1$nm_R
fusion.tab1$function_L = fusion.tab1$inEx_L
fusion.tab1$function_R = fusion.tab1$inEx_R
fusion.tab1 = fusion.tab1[order(-fusion.tab1$exon.junction, -fusion.tab1$known, -fusion.tab1$num_partner_ends, fusion.tab1$intragene, fusion.tab1$frame, -fusion.tab1$num_unique_reads),]
## keep first of same breakpoint (exon.junction, most abundant nss, reads)
fusion.tab2 = fusion.tab1[!duplicated(fusion.tab1$breakpoint),]
head(fusion.tab2)
out.names = c("SampleID", "ge1ge2", "frame", "num_partner_ends", "num_unique_reads", "exon.junction", "breakpoint"
, "transcript_L", "transcript_R", "function_L", "function_R", "gene_L", "cdna_L", "gene_R", "cdna_R", "intragene", "known")
fusion.table = fusion.tab2[, out.names]
fusion.table = fusion.table[order(-fusion.tab2$exon.junction, -fusion.table$known, fusion.table$frame, fusion.table$intragene, -fusion.table$num_partner_ends),]
name1 = names(fusion.table)
name2 = sub('_L', "_5", name1)
name3 = sub('_R', "_3", name2)
name4 = sub('ge1ge2', "GeneExon5_GeneExon3", name3)
names(fusion.table) = name4
fusion.table$exon.junction[fusion.table$exon.junction==2] = 'Both'
fusion.table$exon.junction[fusion.table$exon.junction==1] = 'One'
write.table(fusion.table, paste(sampleID, '.fusion.table.NoFilter.txt', sep=''), row.names=F, quote=F, sep='\t')
##=======================================
## Filter: exon boundary
##=======================================
# inter-gene, in-frame
# or known
fusion.table$g5g3 = paste(fusion.table$gene_5, fusion.table$gene_3, sep='_')
minPartnerEnds_BothExonJunction = as.numeric(minPartnerEnds_BothExonJunction)
minPartnerEnds_OneExonJunction = as.numeric(minPartnerEnds_OneExonJunction)
ex = subset(fusion.table, (known | (g5g3 %in% known.gg & (exon.junction != '0') | frame =='gDNA') ## for future compatibility with gDNA reads
| (exon.junction == 'Both'
& (known ==1 | (frame =='in-frame' & num_partner_ends >= minPartnerEnds_BothExonJunction
& (num_partner_ends >= num_unique_reads | num_partner_ends >= minPartnerEnds_OneExonJunction))))
| (exon.junction == 'One'
& frame != 'out-frame'
& num_partner_ends >= minPartnerEnds_OneExonJunction
& (known ==1 | (intragene==0 & frame =='in-frame')))
))[, 1:15]
# Output 1st of records with same GeneExon5_GeneExon3
ex1 = ex[order(ex$frame, -ex$num_partner_ends, -ex$num_unique_reads),]
write.table(ex1, paste(sampleID, '.brief.summary', sep=''), row.names=F, quote=F, sep='\t')
##=======================================
## export max 10 example fusion reads
##=======================================
ex1 = read.table(paste(sampleID, '.brief.summary', sep=''), sep='\t', stringsAsFactors=F, header=T)
ex2 = ex1[!duplicated(ex1$"GeneExon5_GeneExon3"),]
# ex2 = subset(ex2, frame != 'gDNA')
(nex = min(10, nrow(ex2)))
if (nex >0){
## show max 25 example reads
for (i in 1:nex){
## get read ID
(gei = ex2$"GeneExon5_GeneExon3"[i])
(ngeci = ex2$num_unique_reads[i])
readids = unique(lr5$readID[lr5$ge1ge2 == gei])
(sample.n = min(10, ngeci, length(readids)))
(readid = sample(unique(lr5$readID[lr5$ge1ge2 == gei]), sample.n))
writeLines(readid, 'tmp.readid')
system('sed -e "s/:umi.*//" -e "s:/1.*::" -e "s:/2.*::" tmp.readid | sort -u > tmp.readid2')
## get fa
if (file.exists(paste0(sampleID, '.consolidated.bam'))){
bamfile=paste0(sampleID, '.consolidated.bam')
}else{ bamfile=paste0(bam_dir, "/", sampleID, ".consolidated.bam")}
# system(paste0(path.package('SplitFusion'), "/data/Database/samtools view ",
system(paste0(samtools, " view ",
bamfile, " | grep -f tmp.readid2 | cut -f1,10 | sed 's/^/>/' |\
awk '{$3=length($2); print $0}' | sort -k1,1b -k3,3nr | sort -k1,1b -u |\
cut -d ' ' -f1,2 | tr ' ' '\n' | sed 's/:umi.*//' > "
, sampleID, '.', gei, ".txt", sep=''))
# system(paste0(path.package('SplitFusion'), "/data/Database/samtools view ", ###Modified by Baifeng###
system(paste0(samtools, " view ",
# bamfile, " | grep -f tmp.readid2 | ", paste0(path.package('SplitFusion'), "/data/Database/samtools view -T "), database_dir,"/", refGenome, " -bS - > ", sampleID, '.', gei, ".bam", sep=''))
bamfile, " | grep -f tmp.readid2 | ", paste0(samtools, " view -T "), database_dir,"/", refGenome, " -bS - > ", sampleID, '.', gei, ".bam", sep=''))
# system(paste0(path.package('SplitFusion'), "/data/Database/samtools index ", ###Modified by Baifeng###
system(paste0(samtools, " index ",
sampleID, '.', gei, ".bam"))
}
system('rm tmp.readid*')
}
}
}
}
##==== make empty table for negative samples
if (!file.exists(paste(sampleID, '.brief.summary', sep=''))){
file.create(paste(sampleID, '.brief.summary', sep=''))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.