#Jenny Smith
#Sept. 17, 2020
#purpose: examine junction sequences in RNA-seq reads.
###### Subset BAMS
#' Subset a bam for specific chromome(s) or ranges based on input breakpoint or fusion data file.
#'
#' @param Sample_ID character string of patient ID
#' @param breakpoint character vector with breakpoint string (can be named)
#' @param fusion_data dataframe with fusion results from CICERO, TransAbyss, and STAR-Fusion concatenated
#' @param file_manifest dataframe with Sample_ID and the full file path of the BAM file.
#' @param by_chr boolean - get reads mapped to the entire chromosome containing the breakpoint or just +/- 1e6 bp from breakpoint
#' @param scan boolean - read into memory or not
#' @param Seqlens named numeric vector with chromosome lengths
#' @param outdir destination for subsetted bam files if scan = FALSE
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' my_files_df <- data.frame(Sample="PAZZUM.03A.01R", filepath=dir("/path/to/sample.bam"))
#' subset_bam("PAZZUM.03A.01R", breakpoint=list("16:1250000|16:1260000"), my_files_df)
#'}
#' @import dplyr
subset_bam <- function(Sample_ID, breakpoint=NULL,
fusion_data=NULL,
file_manifest,
by_chr=FALSE, scan=FALSE,Seqlens=NULL,
outdir=file.path("/fh/scratch/delete90/jlsmith3/subset_bams")){
if(by_chr){
offset <- FALSE
if(is.null(Seqlens)){
message("For whole chromosome subset, must provide seqlens vector.")
return(Seqlens)
}
}else{
offset <- TRUE
}
if(all(is.null(breakpoint), is.null(fusion_data))){
message("Breakpoint and fusion data cannot both be NULL. Please provide either a breakpoint as character vector
with the format of `chr:basepairs|chr:basepairs`, for example c(CBFB.MYH11=\"16:15814908|16:67116211\", KMT2A.MLLT10=\"10:21959378|11:118353210\").
Or fusion data is a dataframe with the fusion breakpoints.")
return(NULL)
}
if(!is.null(fusion_data)){
#Define samples and fusions
fusion_data <- filter(fusion_data, Sample==Sample_ID)
sample <- unique(pull(fusion_data,"Sample"))
fusion <- paste(pull(fusion_data, "Fusion.Category"), collapse = "_")
print(c(sample,fusion))
#Define breakpoints
breakpoint <- pull(fusion_data, "breakpoint.TA")
breakpoint <- ifelse(is.na(breakpoint), pull(fusion_data, "Breakpoints.STAR"), breakpoint)
#Define output file name
fname <- paste0(Sample_ID,"_",fusion,".bam")
}else{
#Define output file name
fname <- paste0(Sample_ID,"_subset.bam")
}
create_ranges <- function(x,offset=TRUE){
if(offset){
#plus/minus 1 million bp from the breakpoint
pos <- as.numeric(x[2])
pos <- IRanges::IRanges(start=pos-1e6, end=pos+1e6)
}else{
chr <- x[1]
#this is sepecific to GRCh37-lite and reference from Starfusion CTAT.
#Will need to make more generalized ASAP.
pattern <- paste(c(paste0("chromosome ",chr,","),
paste0(chr,"\\s[0-9XYMT]{1,2}$")),
collapse = "|")
pos <- IRanges::IRanges(start=0, end=Seqlens[grep(pattern,names(Seqlens))])
}
return(pos)
}
#Define breakpoints and basepair ranges
breakpoint <- stringr::str_split(breakpoint,"\\|")
if(length(breakpoint) > 1){
#Need to add a testthat test to check that 1) the breakpoint is in the correct string format, and the input is a list.
#Need to test that the seqnames are in the BAM file (eg chr13 vs 13 NCBI format) and change as necessary.
seqnames <- sapply(breakpoint,
function(chr_pos) sapply(stringr::str_split(chr_pos,":"),function(chr) chr[1]))
chr_ranges <- unlist(lapply(breakpoint,
function(chr_pos) lapply(stringr::str_split(chr_pos,":"), create_ranges, offset=offset)))
}else{
breakpoint <- breakpoint[[1]]
seqnames <- sapply(stringr::str_split(breakpoint,":"),function(x) x[1])
chr_ranges <- lapply(stringr::str_split(breakpoint,":"),create_ranges, offset=offset)
}
#Define location of bam file
bam <- filter(file_manifest,Sample==Sample_ID) %>%
pull(filepath)
if(length(bam) > 1){
message("File manifest has duplicates! Must have one bam file per patient ID.")
return(filter(file_manifest,Sample==Sample_ID))
}
bam_file <- Rsamtools::BamFile(bam)
if(length(bam_file$index) == 0){
#does NOT listen to the destination parameter... could cause issues.
Rsamtools::indexBam(bam, destination=dest)
bam_file <- Rsamtools::BamFile(bam,asMates=TRUE)
}
#Define the ranges to subset
n <- length(seqnames)
ranges <- unlist(IRanges::IRangesList(chr_ranges))
gr <- GenomicRanges::GRanges(seqnames = S4Vectors::Rle(seqnames,rep(1,n)),
ranges = ranges)
gr <- unique(gr) #incase of intrachromosomal fusions, dont want to duplicate the same ranges
# define parameters. Check other options to filter by with scanBamWhat()
params <- Rsamtools::ScanBamParam(which = gr, what = Rsamtools::scanBamWhat())
dest <- outdir
if(scan){
#reads in the subsetted bam file into memory
bam_reads <- Rsamtools::scanBam(bam_file, param = params)
return(bam_reads)
}else{
#saves the subsetted bam file
Rsamtools::filterBam(bam_file, param = params,
destination = paste0(dest,fname),
indexDestination=FALSE)
}
}
#### Create pdict for sequence searches
#' Create Biostrings pdict (pattern dictionary) from a dataframe for effecient sequence searches of RNAseq reads.
#'
#' @param theoretical_seqs_df a dataframe with computationally derived exon junction sequences
#'
#' @return
#' @export
#'
#' @examples
#'my_seqs <- data.frame(Name=as.character(1:10),Sequence=rep(c("ATCGCCCGTTA"), 10))
#'my_pdict_obj <- create_custom_pdict(theoretical_seqs_df=my_seqs)
#'
#' @import dplyr
create_custom_pdict <- function(theoretical_seqs_df){
#computationsal Breakpoint sequences per transcript isoform to search for
#this will need to be run 2-4x bc the nearly 10k sequences requires way too much memory
nr <- nrow(theoretical_seqs_df)
print(paste("There are ",nr," fusion sequences to examine."))
if(nr > 500){
slices <- split(1:nr, cut(1:nr, 4, labels = FALSE))
}else{
slices <- list(1:nr)
}
#Create PDict object with the computationally derived exon-exon junction sequences to search for.
seqExonsList <- list()
for(i in 1:length(slices)){
seqExons <- theoretical_seqs_df %>%
dplyr::slice(slices[[i]]) %>%
dplyr::pull(Sequence, name=Name) %>%
Biostrings::DNAStringSet()
#Add reverse compliment to search
revComp <- Biostrings::reverseComplement(seqExons)
names(revComp) <- paste0("revComp",names(seqExons))
#combine the sequences and the reverse complement into one DNAstringset
seqExons <- c(seqExons, revComp)
seqExons <- seqExons[order(names(seqExons))]
#convert into pdict
seqExons <- Biostrings::PDict(seqExons) #(3) later matchPdict can only be used with max.mismatch=0.???
seqExonsList[[i]] <- seqExons
rm(seqExons)
}
message(paste("Finished PDict processing with",sum(sapply(seqExonsList, length)),"theoretical fusion sequences."))
return(seqExonsList)
}
######### Sequence Pattern Matching
#' Sequence pattern matching of RNAseq reads and counts the number of breakpoint junction reads.
#'
#' @param Sample_ID character string of patient ID
#' @param pdict_obj pdict class object form biostrings.
#' @param RNAseqReads DNAstring set object with RNAseq reads to query
#' @param contigSeqs DNAstring set object with fusion algorithm (eg STAR-fusion) derived fusion contig sequences
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' my_seqs <- data.frame(Name=paste0("seq",1:10),Sequence=rep(c("ATCGCCCGTTA"), 10))
#' my_files_df <- data.frame(Sample="PAZZUM.03A.01R", filepath=dir("/path/to/sample.bam"))
#'
#' my_contigs <- data.frame(contig="AATGGCTATC", Name=paste0("seq",1)) %>% pull(contig,name=Name)
#' my_contigs <- Biostrings::DNAStringSet(my_contigs)
#'
#' my_pdict_obj <- create_custom_pdict(theoretical_seqs_df=my_seqs)
#' my_bam_reads <- subset_bam("PAZZUM.03A.01R", breakpoint=list("16:1250000|16:1260000"), my_files_df)
#'
#' evidence <- match_reads_contigs("PAZZUM.03A.01R", my_pdict_obj, my_bam_reads, contigSeqs=my_contigs)
#' }
#' @import dplyr
match_reads_contigs <- function(Sample_ID, pdict_obj, RNAseqReads, contigSeqs=NULL){
#match RNA-seq reads (Error: cannot allocate vector of size 154.7 Gb)
read.matches <- Biostrings::vcountPDict(pdict = pdict_obj,
subject = RNAseqReads,
max.mismatch = 0, min.mismatch = 0,
with.indels = FALSE)
rownames(read.matches) <- names(pdict_obj)
#Create a tally of the RNAseq read hits.
#Add column for total number of RNA-seq reads searched!
read.match.df <- data.frame(Num_Junction_Breakpoint_Reads=rowSums(read.matches)) %>%
rownames_to_column("Theoretical_Exon_Junctions") %>%
mutate_at(vars(Theoretical_Exon_Junctions), ~gsub("revComp","", .)) %>%
group_by(Theoretical_Exon_Junctions) %>%
mutate(Num_Junction_Breakpoint_Reads=sum(Num_Junction_Breakpoint_Reads)) %>%
ungroup() %>%
filter(!duplicated(Theoretical_Exon_Junctions)) %>%
mutate(Num_RNAseq_Reads_Searched=length(RNAseqReads))
#Final Results table
rm(read.matches)
gc()
results <- tibble(Sample=rep(Sample_ID,
nrow(read.match.df))) %>%
bind_cols(.,read.match.df)
#Match contig sequences
if(!is.null(contigSeqs)){
contig.matches <- Biostrings::vcountPDict(pdict = pdict_obj,
subject = contigSeqs,
max.mismatch = 0, min.mismatch = 0,
with.indels = FALSE)
rownames(contig.matches) <- names(pdict_obj)
colnames(contig.matches) <- names(contigSeqs)
#Create a tally of the contig hits
contig.match.df <- data.frame(Num_Contig_Matches=rowSums(contig.matches)) %>%
rownames_to_column("Theoretical_Exon_Junctions") %>%
mutate_at(vars(Theoretical_Exon_Junctions), ~gsub("revComp","", .)) %>%
group_by(Theoretical_Exon_Junctions) %>%
mutate(Num_Contig_Matches=sum(Num_Contig_Matches)) %>%
ungroup() %>%
filter(!duplicated(Theoretical_Exon_Junctions))
#Also return the contig sequence itself
contig.seq.df <- data.frame(contig.matches) %>%
rownames_to_column("Theoretical_Exon_Junctions") %>%
mutate_at(vars(Theoretical_Exon_Junctions), ~gsub("revComp","", .)) %>%
#make seq and revcomp seq identical results (eg. a hit for seq or revcomp seq considered a hit)
group_by(Theoretical_Exon_Junctions) %>%
mutate_at(vars(matches("contig")), ~sum(.)) %>%
ungroup() %>%
#make into long format and substitue a hit ==1 to hit == contig.sequence
tidyr::gather(Contig.Name, Contig.Match, matches("^contig")) %>%
group_by(Contig.Name) %>%
mutate_at(vars(Contig.Match), ~case_when(
. == 0 ~ as.character(.),
. == 1 ~ as.character(contigSeqs[[unique(Contig.Name)]]))) %>%
group_by(Theoretical_Exon_Junctions, .add=TRUE) %>%
filter(!duplicated(Theoretical_Exon_Junctions)) %>%
ungroup() %>%
tidyr::spread(Contig.Name,Contig.Match) %>%
rename_all(~gsub("contig.", "Matching_Contig_for_Exon_Junctions_", .))
#Final Results table
rm(contig.matches)
gc()
results <- results %>%
left_join(., contig.match.df, by="Theoretical_Exon_Junctions") %>%
left_join(., contig.seq.df, by="Theoretical_Exon_Junctions")
}
return(results)
}
######## workflow
#' Workflow wrapper to extract RNAseq reads and search for custom breakpoint exon junction sequences.
#'
#' @param Sample_ID character string of patient ID
#' @param fusion_data dataframe with fusion results from CICERO, TransAbyss, and STAR-Fusion concatenated
#' @param breakpoint list class with breakpoint string
#' @param file_manifest dataframe with Sample_ID and the full file path of the BAM file.
#' @param Seqlens named numeric vector with chromosome lengths
#' @param fusion_theoretical_pdict a pdict obj with computationally derived exon junction sequences
#' @param cicero.contig a dataframe with contig sequences from CICERO output
#' @param TA.contigs a dataframe with contig sequences from TransAbyss output
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' my_seqs <- data.frame(Name=paste0("seq",1:10),Sequence=rep(c("ATCGCCCGTTA"), 10))
#' my_files_df <- data.frame(Sample="PAZZUM.03A.01R", filepath=dir("/path/to/sample.bam"))
#' chr_lengths <- c("16"=24000000)
#'
#' my_pdict_obj <- create_custom_pdict(theoretical_seqs_df=my_seqs)
#'
#' examine_breakpoint_evidence("PAZZUM.03A.01R", breakpoint=list("16:1250000|16:1260000"),
#' my_files_df,chr_lengths, my_pdict_obj, cicero_contig_df, transabyss_contig_df)
#'}
#' @import dplyr
examine_breakpoint_evidence <- function(Sample_ID,
fusion_data=NULL,
breakpoint=NULL,
file_manifest,
Seqlens,
fusion_theoretical_pdict,
cicero.contig,
TA.contigs){
print(paste("Processing Sample:",Sample_ID))
#Begin subsetting the RNAseq BAM file reads
BAM <- subset_bam(Sample_ID = Sample_ID,
breakpoint=breakpoint,
fusion_data = fusion_data,
file_manifest = file_manifest,
scan = TRUE,
by_chr=TRUE,
Seqlens=Seqlens)
#Select only the RNAseq reads to keep in memory
seqs <- lapply(BAM, `[[`, "seq")
read_names <- lapply(BAM, `[[`, "qname")
print(paste("Subsetting BAM for the Ranges:", paste(names(seqs),collapse="; " )))
#for loop to concatentate the ranges into a single DNAstringset object.
RNAseq <- Biostrings::DNAStringSet() #unlist() doesnt work with the seqs object for some reason, and Ive tried all lapply/sapply possibities.
for(i in 1:length(seqs)){
reads <- seqs[[i]]
names(reads) <- read_names[[i]]
RNAseq <- Biostrings::DNAStringSet(c(RNAseq,reads))
}
rm(BAM)
message(paste("Finished subsetting BAM file for RNAseq read sequences.
There are",length(RNAseq),"RNAseq reads to search."))
#Contigs to Search
try(
cicero <- cicero.contig %>%
filter(grepl(Sample_ID,Sample)) %>%
mutate(Name=paste0("contig.cicero.",1:nrow(.))) %>%
pull(contig,name=Name) %>%
Biostrings::DNAStringSet(),
silent = T
)
#Contigs to Search
try(
TA <- TA.contigs %>%
filter(grepl(Sample_ID,Sample)) %>%
mutate(Name=paste0("contig.TA.",1:nrow(.))) %>%
pull(contig, name=Name) %>%
Biostrings::DNAStringSet(),
silent = T
)
#Combine all contigs
if(all(exists("TA") & exists("cicero"))){
contigs <- c(cicero,TA)
rm(TA,cicero)
}else if(all(exists("TA") & !exists("cicero"))){
contigs <- TA
rm(TA)
}else if(all(!exists("TA") & exists("cicero"))){
contigs <- c(cicero)
rm(cicero)
}
if(exists("contigs")){
message("Finished subsetting fusion algorithm contig sequences.")
}else{
message("No contig sequences found.")
contigs <- NULL
}
#Search for the computationally derived seqs
final_results <- NULL
for(i in 1:length(fusion_theoretical_pdict)){
theoretic_txs <- match_reads_contigs(Sample_ID=Sample_ID,
pdict_obj = fusion_theoretical_pdict[[i]],
RNAseqReads = RNAseq,
contigSeqs = contigs)
if(is.null(final_results)){
final_results <- theoretic_txs
}else{
final_results <- bind_rows(final_results,
theoretic_txs)
}
rm(theoretic_txs) #remove large objects from memory
message("Finished search for ", paste(i," out of", length(fusion_theoretical_pdict)),
" parts of theoretical sequences.")
}
message("Finished search all theoretical sequences.")
return(final_results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.