R/SplitFusion.breakpoint.anno.postscript.direction.sub.R

Defines functions SplitFusion.breakpoint.anno.postscript.direction.sub extractBam

#=====================================================================
#===== Final analysis: frame status, filters, exon-junction, output
#=====================================================================

library(parallel)

extractBam <- function(i,ex2,lr5) {
source('config.txt')
sampleID = sub(".*\\/", "", getwd())
## get read ID
	(gei = ex2$"GeneExon5_GeneExon3"[i])
	(ngeci = ex2$num_unique_reads[i])
	readids = unique(lr5$readID[lr5$ge1ge2 == gei])
	(sample.n = min(10, ngeci, length(readids)))
	(readid = sample(unique(lr5$readID[lr5$ge1ge2 == gei]), sample.n))
	writeLines(readid, paste0(sampleID,'.',gei,'.tmp.readid'))
	system(paste0('sed -e "s/:umi.*//" -e "s:/1.*::" -e "s:/2.*::" ',sampleID,'.',gei,'.tmp.readid | sort -u > ',sampleID,'.',gei,'.tmp.readid2'))

## get fa
	if (file.exists(paste0(sampleID, '.consolidated.bam'))){
		bamfile=paste0(sampleID, '.consolidated.bam')
	}else{ bamfile=paste0(output, "/", sampleID, ".consolidated.bam")}
	system(paste0(samtools, " view ", bamfile, " | grep -f ",sampleID,".",gei,".tmp.readid2 | cut -f1,10 | sed 's/^/>/' |\
		awk '{$3=length($2); print $0}' | sort -k1,1b -k3,3nr | sort -k1,1b -u |\
		cut -d ' ' -f1,2 | tr ' ' '\n' | sed 's/:umi.*//' > "
							, sampleID, '.', gei, ".txt", sep=''))
	system(paste0(samtools, " view ", bamfile, " | grep -f ",sampleID,".",gei,".tmp.readid2 | ", paste0(samtools, " view -T "), refGenome, " -bS - > ", sampleID, '.', gei, ".bam", sep=''))
	system(paste0(samtools, " index ",
                                                        sampleID, '.', gei, ".bam"))
	system(paste0('rm ',sampleID,'.',gei,'.tmp.readid*'))
}

SplitFusion.breakpoint.anno.postscript.direction.sub = function(lr3){
source('config.txt')
n.lr3 = nrow(lr3)
sampleID = sub(".*\\/", "", getwd())

if (n.lr3 >0){
	## add the 6 base back to cDNA position
	lr3$cdna_L = lr3$cdna_L0 + 6
	lr3$cdna_R = lr3$cdna_R0 - 6

	## intra-gene
	lr3$intragene = ifelse(lr3$gene_L == lr3$gene_R, 1, 0)
	lr3$exon = substr(lr3$exon_L,1,4)
	lr3$exonD = 0
	lr3$exonD = ifelse(lr3$gene_L == lr3$gene_R, lr3$exonn_R - lr3$exonn_L, lr3$exonD)
	lr3$gec_L = paste0(lr3$gene_L, ' ', lr3$exon_L, ' c.', lr3$cdna_L, ' (', lr3$nm_L, ')')
	lr3$gec_R = paste0(lr3$gene_R, ' ', lr3$exon_R, ' c.', lr3$cdna_R, ' (', lr3$nm_R, ')')
	lr3$gec_LR = paste(lr3$gec_L, lr3$gec_R, sep='---')
	lr3$gg = paste0(lr3$gene_L, '_', lr3$gene_R)
	lr3$ge1 = paste0(lr3$gene_L, '_', lr3$exon_L)
	lr3$ge2 = paste0(lr3$gene_R, '_', lr3$exon_R)
	lr3$ge1ge2 = paste0(lr3$ge1,'---', lr3$ge2)

	##======================
	## frame status
	##======================
	    lr3$frameD = (lr3$cdna_R0 %% 3 - (lr3$cdna_L0 - lr3$overlap) %% 3) %% 3
	    lr3$frame = ifelse(lr3$frameD ==1, 'in-frame', 'out-frame')
	    lr3$frame[is.na(lr3$frame)] = 'N.A.'
		#head(lr3)
		## expect mostly in-frame or _NA_. Otherwise, might need inspection
		#table(lr3$frame,useNA='always')
				     
	    lr3$neighb = ifelse(lr3$exonD==1, 1, 0)
	    lr3$neighb[is.na(lr3$neighb)] = 0

		## order
		lr3 = lr3[order(lr3$intragene, lr3$frame, lr3$gec_LR),]

	##===========================================================
	##=== Optional: Backend database supported target output or filtering
	##===========================================================
	    ##==== Targeted output known significant fusions or splicing isoforms (e.g. MET ex14 skipping)
		known.gg=NA; known.ge=NA; known.partner=NA; known.3UTR=NA; known.k=NA;
	    known.gg.file = paste0(panel_dir, '/known.gene-gene.txt'); if (file.exists(known.gg.file)){known.gg = readLines(known.gg.file)}
	    known.ge.file = paste0(panel_dir, '/known.gene-exon.txt'); if (file.exists(known.ge.file)){known.ge = readLines(known.ge.file)}
	    known.partner.file = paste0(panel_dir, '/known.partners.txt'); if (file.exists(known.partner.file)){known.partner = readLines(known.partner.file)}
	    known.3UTR.truncation.file = paste0(panel_dir, '/known.3UTR.truncation.txt'); if (file.exists(known.3UTR.truncation.file)){known.3UTR = readLines(known.3UTR.truncation.file)} 
	    known.k.file = paste0(panel_dir, '/known.priority.txt'); if (file.exists(known.k.file)){known.k = readLines(known.k.file)}
	    ##==== Targeted remove unwanted fusions (e.g. involving homologous genes or recurrent falsed positives)
		filter.gg=NA; filter.ge=NA; 
	    filter.gg.file = paste0(panel_dir, '/filter.gene-gene.txt'); if (file.exists(filter.gg.file)){filter.gg0 = read.table(filter.gg.file, header=F, stringsAsFactors=F);
				 filter.gg = c(paste(filter.gg0$V1, filter.gg0$V2, sep='_'), paste(filter.gg0$V2, filter.gg0$V1, sep='_'))} 
	    filter.ge.file = paste0(panel_dir, '/filter.gene-exon.txt'); if (file.exists(filter.ge.file)){filter.ge = readLines(filter.ge.file)}

	    lr3$known=0
	    lr3$known[(lr3$gene_T == lr3$gene_L & lr3$gene_R %in% known.partner & lr3$intragene==0)] = 1
	    lr3$known[(lr3$gene_T == lr3$gene_R & lr3$gene_L %in% known.partner & lr3$intragene==0)] = 1
	    lr3$known[(lr3$ge1ge2 %in% known.ge)] = 1
	    lr3$known[(lr3$ge1 %in% known.3UTR & lr3$homolog==0 & lr3$intragene==0)] = 1
	    lr32 = subset(lr3, (known==1 | (num_partner_ends >= as.numeric(FusionMinStartSite) & intragene==0)) & !(gg %in% filter.gg | ge1 %in% filter.ge | ge2 %in% filter.ge))
		    # nrow(lr3); nrow(lr32)
		    # output for furture research
		    write.table(lr32, paste(sampleID, '.fusion.list.pre-processing.txt', sep=''), row.names=F, quote=F, sep='\t')

	##======================
	## Further filters
	##======================
	## To remove:
	#	- read-through (defined as within 100K distance on chrosome)
	#	- homologous genes
	#	- neighbour exons of same gene
	#	- out-frame (keep out-frame of different genes. FGFRs fusion can be functional even out-frame)
	#	- same exon
	#
	## To calculate:
	#	- exon boundary/junction
	if (nrow(lr32)>0){
# read.through limit reduced to 84000 to allow RPS6KB1--VMP1 in LUAD-62-A46O
# further reduce to 76000 to allow ERBB2--PPP1R1B in BLCA-DK-A1A3
		lr32$read.through = 0
		lr32$read.through[lr32$gene_L != lr32$gene_R 
					& lr32$chr_L == lr32$chr_R
					& lr32$strand_L == lr32$strand_R
					& lr32$geneStrand_L == lr32$geneStrand_R
                                        & (lr32$geneStrand_L == '+' & lr32$pos_R - lr32$pos_L >= 0 & lr32$pos_R - lr32$pos_L < 100000 |
                                           lr32$geneStrand_L == '-' & lr32$pos_L - lr32$pos_R >= 0 & lr32$pos_L - lr32$pos_R < 100000)
					]=TRUE

		homolog.match = function(row){
				h1 = pmatch(substr(row[1],1,3), row[2])
				h2 = pmatch(substr(row[2],1,3), row[1])
				h3 = (h1 || h2)
              			return(h3)
			}
		lr32$homolog = apply(lr32[,c('gene_L', 'gene_R')], 1, homolog.match)
		lr32$homolog[is.na(lr32$homolog)] = 0
		lr32$homolog[lr32$known==1] = 0

		lr40 = subset(lr32, !(read.through | homolog | neighb==1 | (intragene==1 & exon_L==exon_R)
					| (intragene==1 & frame=='out-frame')
					))
		lr4 = lr40[!duplicated(lr40$readID),]
		n.lr40 = nrow(lr40)
		n.lr4 = nrow(lr4)
		
		if (n.lr4>0){
			lr4$SampleID = sampleID
			# remove illegal symbol
			lr4$gec_LR = gsub('\\(', '.', lr4$gec_LR)
			lr4$gec_LR = gsub('\\)', '.', lr4$gec_LR)
			lr4$ge1ge2 = gsub('\\(', '.', lr4$ge1ge2)
			lr4$ge1ge2 = gsub('\\)', '.', lr4$ge1ge2)

			##======================
			## exon junction support based on RefGene exon starts/ends
			##======================
				bk = lr4[, c('breakpoint', 'gene_L', 'pos_L', 'gene_R', 'pos_R')]
					# most representative pos
				bk2 = ddply(bk, .(breakpoint, gene_L, gene_R), summarize
					, 'pos_L' = names(sort(table(pos_L), decreasing = TRUE)[1])
					, 'pos_R' = names(sort(table(pos_R), decreasing = TRUE)[1]))
					
					bk.genes = unique(c(bk2$gene_L, bk2$gene_R))
				refGene = read.table(paste0(SFpath, '/inst/data/', genomeVer, '_refGene0.txt'), header=T, sep='\t')
				refGene2 = subset(refGene, name2 %in% bk.genes)[,c('name2', 'exonStarts', 'exonEnds')]

					toStarts = function(row){
						dist = as.numeric(unlist(strsplit(as.character(row[1]),',')))+1 - as.numeric(row[2])
						if (any(dist %in% 0:2)) {junction=1} else {junction=0}
						return(junction)
						}

					fromEnds = function(row){
						dist = as.numeric(unlist(strsplit(as.character(row[1]),','))) - as.numeric(row[2])
						if (any(dist %in% -2:0)) {junction=1} else {junction=0}
						return(junction)
						}
					
				junc.L = unique(merge(bk2[, c('breakpoint', 'gene_L', 'pos_L')], refGene2, by.x='gene_L', by.y='name2', all.x=T))
					junc.L$distLstart = apply(junc.L[,c('exonStarts', 'pos_L')], 1, toStarts)
					junc.L$distLend = apply(junc.L[,c('exonEnds', 'pos_L')], 1, fromEnds)

				junc.R = unique(merge(bk2[, c('breakpoint', 'gene_R', 'pos_R')], refGene2, by.x='gene_R', by.y='name2', all.x=T))
					junc.R$distRstart = apply(junc.R[,c('exonStarts', 'pos_R')], 1, toStarts)
					junc.R$distRend = apply(junc.R[,c('exonEnds', 'pos_R')], 1, fromEnds)

				junc.LR = merge(junc.L, junc.R, by='breakpoint')[,c('breakpoint', 'distLstart', 'distLend', 'distRstart', 'distRend')]
				junc.LR$exon.junction = junc.LR$distLstart + junc.LR$distLend + junc.LR$distRstart + junc.LR$distRend

				#lr5 = merge(lr4, unique(junc.LR[,c('breakpoint', 'exon.junction')]), by='breakpoint')
				lr5 = merge(lr4, unique(junc.LR[,c('breakpoint', 'exon.junction')]), by='breakpoint', allow.cartesian=TRUE)

		## output
			lr5 = lr5[order(-lr5$exon.junction, -lr5$known, -lr5$num_partner_ends, lr5$intragene, lr5$frame),]
			fusion.list = subset(lr5, select=c('SampleID', 'ge1ge2', 'frame','gec_LR','chr_L','pos_L','inEx_L','gene_L','nm_L','exon_L','cdna_L'
						, 'overlap','chr_R','pos_R','inEx_R','gene_R','nm_R','exon_R','cdna_R','intragene','readID'
						, 'breakpoint', 'num_partner_ends', 'known', 'exon.junction'))
				write.table(fusion.list, paste(sampleID, '.fusion.list.post-processing.txt', sep=''), row.names=F, quote=F, sep='\t')

			## consolidate breakpoint
				fusion.list[is.na(fusion.list)]='-'
				fusion.list$readID2 = sub(':umi:.*', '', fusion.list$readID)
			fusion.tab1 = ddply(fusion.list, .(breakpoint, SampleID, ge1ge2, frame, nm_L, inEx_L, gene_L, cdna_L, nm_R, inEx_R, gene_R, cdna_R, intragene, overlap, num_partner_ends, known), summarize
							, 'num_unique_reads'=length(unique(readID2)), 'exon.junction' = max(exon.junction)
							)
				fusion.tab1$transcript_L = fusion.tab1$nm_L
				fusion.tab1$transcript_R = fusion.tab1$nm_R
				fusion.tab1$function_L = fusion.tab1$inEx_L
				fusion.tab1$function_R = fusion.tab1$inEx_R
			fusion.tab1 = fusion.tab1[order(-fusion.tab1$exon.junction, -fusion.tab1$known, -fusion.tab1$num_partner_ends, fusion.tab1$intragene, fusion.tab1$frame, -fusion.tab1$num_unique_reads),]

			## keep first of same breakpoint (exon.junction, most abundant nss, reads)
			fusion.tab2 = fusion.tab1[!duplicated(fusion.tab1$breakpoint),]
				head(fusion.tab2)

		out.names = c("SampleID", "ge1ge2", "frame", "num_partner_ends", "num_unique_reads", "exon.junction", "breakpoint"
				, "transcript_L", "transcript_R", "function_L", "function_R", "gene_L", "cdna_L", "gene_R", "cdna_R", "intragene", "known")
		fusion.table = fusion.tab2[, out.names]
		fusion.table = fusion.table[order(-fusion.tab2$exon.junction, -fusion.table$known, fusion.table$frame, fusion.table$intragene, -fusion.table$num_partner_ends),]
				name1 = names(fusion.table)
				name2 = sub('_L', "_5", name1)
				name3 = sub('_R', "_3", name2)
				name4 = sub('ge1ge2', "GeneExon5_GeneExon3", name3)
				names(fusion.table) = name4
				fusion.table$exon.junction[fusion.table$exon.junction==2] = 'Both'
				fusion.table$exon.junction[fusion.table$exon.junction==1] = 'One'
			write.table(fusion.table, paste(sampleID, '.fusion.table.NoFilter.txt', sep=''), row.names=F, quote=F, sep='\t')

	##=======================================
	## Filter: exon boundary
	##=======================================
		    # inter-gene, in-frame
		    # or known
			fusion.table$g5g3 = paste(fusion.table$gene_5, fusion.table$gene_3, sep='_')
			minPartnerEnds_BothExonJunction = as.numeric(minPartnerEnds_BothExonJunction)
			minPartnerEnds_OneExonJunction = as.numeric(minPartnerEnds_OneExonJunction)
				
			ex = subset(fusion.table, (known | (g5g3 %in% known.gg & (exon.junction != '0') | frame =='gDNA') ## for future compatibility with gDNA reads
							 | (exon.junction == 'Both' 
							    & (known ==1 | ((frame == 'in-frame' | (frame == 'N.A.' & grepl("exon[1-9]", GeneExon5_GeneExon3) & grepl("UTR", GeneExon5_GeneExon3))) & num_partner_ends >= minPartnerEnds_BothExonJunction 
										& (num_partner_ends >= num_unique_reads | num_partner_ends >= minPartnerEnds_OneExonJunction))))
							 | (exon.junction == 'One' 
							    & frame != 'out-frame'
							    & num_partner_ends >= minPartnerEnds_OneExonJunction
							    & (known ==1 | (intragene==0 & frame =='in-frame')))
					))[, 1:15]

			# Output 1st of records with same GeneExon5_GeneExon3
			ex1 = ex[order(ex$frame, -ex$num_partner_ends, -ex$num_unique_reads),]
				write.table(ex1, paste(sampleID, '.brief.summary', sep=''), row.names=F, quote=F, sep='\t')
			
			
	##=======================================
	## export fusion reads involving known kinase genes
	## If the number of such fusions are fewer than 
        ## number of threads, extract the top fusions
	## from the fusions not involing kinase
	##=======================================
			ex1 = read.table(paste(sampleID, '.brief.summary', sep=''), sep='\t', stringsAsFactors=F, header=T)
			ex2 = ex1[!duplicated(ex1$"GeneExon5_GeneExon3"),]
			# ex2 = subset(ex2, frame != 'gDNA')
			# fusions involving known kinase 
			ex3 = ex2[ex2$gene_5 %in% known.k | ex2$gene_3 %in% known.k,]
			# fusions not involing known kinase
			ex4 = ex2[!(ex2$gene_5 %in% known.k | ex2$gene_3 %in% known.k),]
			# minimum fusions to export is 10
			nex = 10
			# at the minimum, we will display all kinase fusions
			if (dim(ex3)[1] > nex) {
				nex = dim(ex3)[1]
			}
			ex5 = rbind(ex3,ex4)
			if (nex >0){
				## show max 25 example reads 
				cores = min(detectCores(),as.numeric(thread))
				mclapply(1:nex,extractBam,ex2=ex5,lr5=lr5,mc.cores=cores)
		        }
		}
	}
}

##==== make empty table for negative samples
if (!file.exists(paste(sampleID, '.brief.summary', sep=''))){
	file.create(paste(sampleID, '.brief.summary', sep=''))
}
}
Zheng-NGS-Lab/SplitFusion documentation built on March 29, 2025, 11:01 a.m.