knitr::opts_knit$set( root.dir = file.path(PROJHOME,"2020.09.15_RNAseq_Fusion_Breakpoints")) library(fusBreakpoint) library(magrittr) library(dplyr) library(tibble) library(stringr)
knitr::opts_chunk$set( collapse = TRUE, comment = "#>", tidy.opts=list(width.cutoff=50), tidy=TRUE, fig.align='center', fig.width = 10, fig.height = 10 )
#Needs to be build into the package source(file.path(SCRIPTS,"RNAseq_Analysis/fusBreakpoint/R/survplot_functions.R.bak"))
merged <- read.csv(file.path(CDE,"Merged/TARGET_AML_0531_1031_merged_CDEs_9.18.20.csv")) merged <- merged %>% filter(!is.na(USI), USI != "Unknown") %>% set_rownames(.$USI) inelig <- c(773920,775026, 786948,799528) ineligables <- merged %>% filter(Eligibility_Comments == "remove" | Reg. %in% inelig) %>% pull(USI) dim(merged)
IDmap <- read.csv("References/St.Jude_TARGET_CICERO_Sample_IDmap.csv") head(IDmap)
SJCBF <- read.csv("References/SJ_CBF_AML_Clinical_Data.csv") %>% mutate(Sample=str_split_fixed(ID, "\\/", n=2)[,1]) %>% mutate(subject_name=str_split_fixed(Sample,"_", n=2)[,1]) %>% select(Sample,subject_name, ID, everything()) # head(SJCBF) dim(SJCBF)
theoretical_data.collapsed <- read.csv("Data/Theoretical_Fusion_Breakpoint_Junctions_per_Transcript.csv") head(theoretical_data.collapsed) dim(theoretical_data.collapsed) # 1955 4
polyA_RNAseq_files <- read.csv("BAM_Manifests/TARGET_AML_polyA_RNAseq_Bam_Manifest_10.02.20.csv", row.names = 1) dim(polyA_RNAseq_files) table(duplicated(polyA_RNAseq_files$Sample)) #FALSE
RBD_RNAseq_files <- read.csv("BAM_Manifests/TARGET_AML_Ribodepleted_RNAseq_Bam_Manifest_10.02.20.csv",row.names = 1) dim(RBD_RNAseq_files) table(duplicated(RBD_RNAseq_files$Sample)) #FALSE
SJ_RNAseq_files <- read.csv("BAM_Manifests/St.Jude_AML_ALL_RNAseq_Bam_Manifest_10.09.20.csv") dim(SJ_RNAseq_files) table(duplicated(SJ_RNAseq_files$Sample)) #FALSE
outdir <- file.path(SCRATCH,'jlsmith3/Fusion_Breakpoints')
results_files <- dir(file.path(outdir), pattern = "results_[01].RDS", recursive = T, full.names = T) Names <- str_split_fixed(results_files, "\\/", n=9)[,8] Names <- str_split_fixed(Names,"_", n=6)[,3:4] Names <- paste(as.character(Names[,1]), as.character(Names[,2]), sep=".") names(results_files) <- Names length(results_files) # 62 head(results_files) table(names(results_files))
TARGET_results <- grep("polyA_|RBD_", results_files, value = T) table(names(TARGET_results)) length(TARGET_results)
cat_results <- lapply(seq(2,length(TARGET_results), by=2), function(i){ #select the results files (in two parts) rds0 <- TARGET_results[i-1] rds1 <- TARGET_results[i] # print(c(rds0,rds1)) #define output filename and destination fname <- str_split_fixed(rds0,"\\/", n=9)[,8] %>% gsub("_rslurm_", "", .) %>% paste0("TARGET_AML_",., "_breakpoint_evidence.csv") fusion <- unique(c(names(rds0),names(rds1))) print(fusion) destination <- paste0(getwd(), "/", fusion, "_Results") suppressWarnings(dir.create(destination,recursive = T)) #read in the results res <- purrr::map_dfr(c(rds0, rds1), function(x) bind_rows(readRDS(x))) # RNAseq batch annotations if(grepl("polyA", rds0)){ res <- res %>% left_join(., select(polyA_RNAseq_files, -filename, -filepath), by="Sample") }else{ res <- res %>% left_join(., select(RBD_RNAseq_files, -filename, -filepath), by="Sample") } #merge in clinical annotations res <- res %>% left_join(., select(merged, USI, matches("OS\\.[tei]|EFS\\.[tei]|Event",ignore.case=FALSE)), by="USI") %>% mutate_at(vars(Primary.Fusion), ~ifelse(grepl("NBM|PB", Group), Group, .)) %>% select(Sample,USI,Protocol, Primary.Fusion,Additional.Fusions.CNV, Theoretical_Exon_Junctions, Num_Junction_Breakpoint_Reads, Num_RNAseq_Reads_Searched, Lib_Prep:AML_Subtype, everything(), -Reg., -Final_Patient_ID, -PATIENT_ID_Original) print(paste(destination, fname, sep="/")) write.csv(res,paste(destination, fname, sep="/"), row.names = FALSE) })
cbfb_mut20_RBD <- read.csv("CBFB.MYH11_Results/TARGET_AML_CBFB_MYH11_RBD_RNAseq_mut20_breakpoint_evidence.csv") %>% mutate(Exons=gsub(".+Exons([0-9]_[0-9]{1,2})_.+", "\\1", Theoretical_Exon_Junctions)) %>% dplyr::select(Sample:Additional.Fusions.CNV, Exons,Num_Junction_Breakpoint_Reads, everything()) head(cbfb_mut20_RBD) length(unique(cbfb_mut20_RBD$Sample)) #1617
cbfb_mut20_polyA <- read.csv("CBFB.MYH11_Results/TARGET_AML_CBFB_MYH11_polyA_RNAseq_mut20_breakpoint_evidence.csv") %>% mutate(Exons=gsub(".+Exons([0-9]_[0-9]{1,2})_.+", "\\1", Theoretical_Exon_Junctions)) %>% dplyr::select(Sample:Additional.Fusions.CNV, Exons,Num_Junction_Breakpoint_Reads, everything()) head(cbfb_mut20_polyA) length(unique(cbfb_mut20_polyA$Sample)) #508
exon_to_genomicLocus <- xlsx::read.xlsx("References/HutchFusion_neoSplice_CBFB.xlsx", sheetIndex = 1) %>% select(Exons,Isoform,chrA,posA,chrB,posB,Mut20) %>% distinct() %>% rowwise() %>% mutate(Breakpoint=paste(paste(chrA,posA, sep=":"), paste(chrB,posB, sep = ":"), sep="|")) %>% ungroup() %>% group_by(Exons,Mut20) %>% mutate(Breakpoint=paste(Breakpoint, collapse = "; ")) %>% arrange(Exons) %>% select(Exons,Isoform,Mut20, Breakpoint) %>% distinct() # exon_to_genomicLocus # unique(exon_to_genomicLocus$Mut20) #5
cbfb_all <- cbfb_mut20_RBD %>% bind_rows(., cbfb_mut20_polyA) %>% # filter(!USI %in% ineligables) %>% group_by(Sample,Lib_Prep) %>% mutate(CBFB_MYH11_True_Positive=case_when( (grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads > 0) ~ "TP", !(grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads > 0) ~ "FP", (grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads == 0) ~ "FN", !(grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads == 0) ~ "TN")) %>% ungroup() %>% left_join(., exon_to_genomicLocus, by="Exons") %>% dplyr::select(Sample:Additional.Fusions.CNV,CBFB_MYH11_True_Positive, everything()) # dim(cbfb_all) #10625 34 # length(unique(cbfb_all$Sample)) #1967 (1961 without ineligables) head(cbfb_all)
cbfb_res_orig <- xlsx::read.xlsx("CBFB.MYH11_Results/Theoretical_data/TARGET_0531_1031.CBFB-MYH11.FusionBreakpoint.Unmasked.xlsx", sheetIndex = 1) # head(cbfb_res_orig) dim(cbfb_res_orig)#152 4 length(unique(cbfb_res_orig$USI))
check <- cbfb_res_orig %>% inner_join(., cbfb_all, by="USI") %>% select(Sample:Exons,CBFB_MYH11_True_Positive, Fusion_Breakpoint_Unmasked, Fusion_Breakpoint, everything()) %>% group_by(Sample, Lib_Prep) %>% mutate(Keep=case_when( Num_Junction_Breakpoint_Reads > 0 ~ TRUE, Num_Junction_Breakpoint_Reads == 0 & CBFB_MYH11_True_Positive == "FN" ~ !duplicated(Sample), TRUE ~ FALSE)) %>% ungroup() %>% # filter(Num_Junction_Breakpoint_Reads > 0) %>% # mutate(Different=ifelse(Exons==Fusion_Breakpoint_Unmasked, "No", "Yes")) %>% # arrange(desc(Fusion_Breakpoint), Fusion_Breakpoint_Unmasked, USI) %>% select(Keep, Sample:Additional.Fusions.CNV,Lib_Prep, # Different, everything()) check # table(check$Different, useNA='ifany') # table(duplicated(check$USI)) # write.csv(check,"CBFB_MYH11_Check.csv", row.names = FALSE)
toFill <- cbfb_all %>% filter(CBFB_MYH11_True_Positive=="TP", Num_Junction_Breakpoint_Reads >0) %>% filter(! USI %in% cbfb_res_orig$USI) %>% select(Sample,USI, Exons,Isoform,Mut20,Breakpoint) # toFill # dim(toFill) # write.csv(toFill,"CBFB.MYH11_Results/Theoretical_data/TARGET_AML_CBFB-MYH11_RNAseq_mut20_bySeq_Search.csv", row.names = FALSE)
cbfb_all %>% select(Sample,CBFB_MYH11_True_Positive) %>% unique() %>% group_by(CBFB_MYH11_True_Positive) %>% summarise(N=n()) %>% ungroup()
FN_all <- filter(cbfb_all, CBFB_MYH11_True_Positive == "FN") %>% arrange(Protocol) %>% dplyr::select(Sample,USI,Protocol, Lib_Prep, Primary.Fusion, Additional.Fusions.CNV, CBFB_MYH11_True_Positive) %>% unique() FN_all #13 False negatives (not so great) # table(FN_all$USI %in% ineligables) # table(FN_all$USI %in% cbfb_res_orig$USI)
TARGET.20.PARAHF.09A.01R #Low depth TARGET.20.PASGKA.09A.01R # Low depth TARGET.20.PASVVM.09A.01R #low depth
TARGET.20.PAURKG.09A.01R #RBD TARGET.20.PAWHTL.03A.01R #RBD TARGET.20.PAWNIK.09A.01R #RBD
#Need STAR fusion data in order to see how many Inv.16 were detected. star.fmt.primary <- read.csv(file.path("References/St.Jude_AML_ALL_STAR_Fusion_reformatted_FilteredForNBM_PrimaryFusions_10.12.20.csv")) %>% left_join(., SJ_RNAseq_files, by="Sample") dim(star.fmt.primary) head(star.fmt.primary) length(unique(star.fmt.primary$Sample))
table(SJ_RNAseq_files$sj_diseases) dim(SJ_RNAseq_files)
CBF.JS <- star.fmt.primary %>% filter(grepl("CBFB-MYH11", Fusion.Category) | sj_diseases == "CBF") %>% select(Sample,X.Fusion,sj_diseases) %>% arrange(X.Fusion) CBF.JS %>% filter(grepl("CBFB-MYH11", X.Fusion))
SJ_results <- grep("_SJ_", results_files, value = T) # SJresults table(names(SJ_results))
cat_results <- lapply(seq(2,length(SJ_results), by=2), function(i){ #select the results files (in two parts) rds0 <- SJ_results[i-1] rds1 <- SJ_results[i] # print(c(rds0,rds1)) #define output filename and destination fname <- str_split_fixed(rds0,"\\/", n=9)[,8] %>% gsub("_rslurm_", "", .) %>% paste0("St.Jude_AML_ALL_",., "_breakpoint_evidence.csv") fusion <- unique(c(names(rds0),names(rds1))) print(fusion) destination <- paste0(getwd(), "/", fusion, "_Results") # suppressWarnings(dir.create(destination,recursive = T)) #read in the results res <- purrr::map_dfr(c(rds0, rds1), function(x) bind_rows(readRDS(x))) # RNAseq batch annotations res <- res %>% left_join(., select(SJ_RNAseq_files, -filename, -filepath), by="Sample") print(paste(destination, fname, sep="/")) write.csv(res,paste(destination, fname, sep="/"), row.names = FALSE) })
SJ.CBFB <- read.csv("CBFB.MYH11_Results/St.Jude_AML_ALL_CBFB_MYH11_SJ_RNAseq_mut20_breakpoint_evidence.csv") head(SJ.CBFB)
filter(SJ.CBFB, Num_Junction_Breakpoint_Reads > 0 ) %>% filter(!duplicated(Sample))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.