Set-up

knitr::opts_knit$set(
  root.dir = file.path(PROJHOME,"2020.09.15_RNAseq_Fusion_Breakpoints"))

library(fusBreakpoint)
library(magrittr)
library(dplyr)
library(tibble)
library(stringr)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  tidy.opts=list(width.cutoff=50),
  tidy=TRUE, 
  fig.align='center',
  fig.width = 10, 
  fig.height = 10
)

Define Functions

#Needs to be build into the package
source(file.path(SCRIPTS,"RNAseq_Analysis/fusBreakpoint/R/survplot_functions.R.bak"))

Read in ClinData

merged <- read.csv(file.path(CDE,"Merged/TARGET_AML_0531_1031_merged_CDEs_9.18.20.csv"))

merged <- merged %>%
  filter(!is.na(USI), USI != "Unknown") %>%
  set_rownames(.$USI)

inelig <- c(773920,775026, 786948,799528)
ineligables <- merged %>% 
  filter(Eligibility_Comments == "remove" | Reg. %in% inelig) %>% 
  pull(USI)

dim(merged)
IDmap <- read.csv("References/St.Jude_TARGET_CICERO_Sample_IDmap.csv")

head(IDmap)
SJCBF <- read.csv("References/SJ_CBF_AML_Clinical_Data.csv") %>% 
  mutate(Sample=str_split_fixed(ID, "\\/", n=2)[,1]) %>% 
  mutate(subject_name=str_split_fixed(Sample,"_", n=2)[,1]) %>%
  select(Sample,subject_name, ID, everything())

# head(SJCBF)
dim(SJCBF)

Read in the Theoretical Junc Sequences

theoretical_data.collapsed <- read.csv("Data/Theoretical_Fusion_Breakpoint_Junctions_per_Transcript.csv")

head(theoretical_data.collapsed)
dim(theoretical_data.collapsed) # 1955    4

Read in File Manifests

polyA_RNAseq_files <- read.csv("BAM_Manifests/TARGET_AML_polyA_RNAseq_Bam_Manifest_10.02.20.csv", 
                               row.names = 1) 


dim(polyA_RNAseq_files)
table(duplicated(polyA_RNAseq_files$Sample)) #FALSE
RBD_RNAseq_files <- read.csv("BAM_Manifests/TARGET_AML_Ribodepleted_RNAseq_Bam_Manifest_10.02.20.csv",row.names = 1)

dim(RBD_RNAseq_files)
table(duplicated(RBD_RNAseq_files$Sample)) #FALSE
SJ_RNAseq_files <- read.csv("BAM_Manifests/St.Jude_AML_ALL_RNAseq_Bam_Manifest_10.09.20.csv")

dim(SJ_RNAseq_files)
table(duplicated(SJ_RNAseq_files$Sample)) #FALSE

Define the Results files

outdir <- file.path(SCRATCH,'jlsmith3/Fusion_Breakpoints')
results_files <- dir(file.path(outdir), 
                 pattern = "results_[01].RDS",
                 recursive = T, 
                 full.names = T)

Names <- str_split_fixed(results_files, "\\/", n=9)[,8] 
Names <- str_split_fixed(Names,"_", n=6)[,3:4]
Names <- paste(as.character(Names[,1]),
               as.character(Names[,2]), sep=".")
names(results_files) <- Names

length(results_files) # 62
head(results_files)
table(names(results_files))

Save TARGET AML Results

TARGET_results <- grep("polyA_|RBD_", results_files, value = T)

table(names(TARGET_results))
length(TARGET_results)
cat_results <- lapply(seq(2,length(TARGET_results), by=2), 
                      function(i){

                #select the results files  (in two parts)
                rds0 <- TARGET_results[i-1]
                rds1 <- TARGET_results[i]
                # print(c(rds0,rds1))

                #define output filename and destination
                fname <- str_split_fixed(rds0,"\\/", n=9)[,8] %>% 
                  gsub("_rslurm_", "", .) %>% 
                  paste0("TARGET_AML_",., "_breakpoint_evidence.csv")

                fusion <- unique(c(names(rds0),names(rds1)))
                print(fusion)
                destination <- paste0(getwd(), "/", fusion, "_Results")
                suppressWarnings(dir.create(destination,recursive = T))

                #read in the results
                res <- purrr::map_dfr(c(rds0, rds1), function(x) bind_rows(readRDS(x)))

                # RNAseq batch annotations
                if(grepl("polyA", rds0)){
                  res <- res %>%
                      left_join(., select(polyA_RNAseq_files, -filename, -filepath),
                              by="Sample") 
                }else{
                  res <- res %>%
                      left_join(., select(RBD_RNAseq_files, -filename, -filepath),
                                by="Sample") 
                }

                #merge in clinical annotations
                res <- res %>%
                     left_join(., select(merged, USI,
                                            matches("OS\\.[tei]|EFS\\.[tei]|Event",ignore.case=FALSE)),
                                  by="USI") %>%
                      mutate_at(vars(Primary.Fusion), ~ifelse(grepl("NBM|PB", Group), Group, .)) %>% 
                      select(Sample,USI,Protocol,
                               Primary.Fusion,Additional.Fusions.CNV,
                               Theoretical_Exon_Junctions,
                              Num_Junction_Breakpoint_Reads,
                              Num_RNAseq_Reads_Searched,
                              Lib_Prep:AML_Subtype, everything(),
                             -Reg., -Final_Patient_ID, -PATIENT_ID_Original)

                print(paste(destination, fname, sep="/"))
                write.csv(res,paste(destination, fname, sep="/"), row.names = FALSE)

              })

Analysis of Breakpoint Evidence

CBFB-MYH11

cbfb_mut20_RBD <- read.csv("CBFB.MYH11_Results/TARGET_AML_CBFB_MYH11_RBD_RNAseq_mut20_breakpoint_evidence.csv") %>% 
  mutate(Exons=gsub(".+Exons([0-9]_[0-9]{1,2})_.+", "\\1", Theoretical_Exon_Junctions)) %>%
  dplyr::select(Sample:Additional.Fusions.CNV, Exons,Num_Junction_Breakpoint_Reads, everything())

head(cbfb_mut20_RBD)
length(unique(cbfb_mut20_RBD$Sample)) #1617
cbfb_mut20_polyA <- read.csv("CBFB.MYH11_Results/TARGET_AML_CBFB_MYH11_polyA_RNAseq_mut20_breakpoint_evidence.csv") %>% 
  mutate(Exons=gsub(".+Exons([0-9]_[0-9]{1,2})_.+", "\\1", Theoretical_Exon_Junctions)) %>%
  dplyr::select(Sample:Additional.Fusions.CNV, Exons,Num_Junction_Breakpoint_Reads, everything())

head(cbfb_mut20_polyA)
length(unique(cbfb_mut20_polyA$Sample)) #508
exon_to_genomicLocus <- xlsx::read.xlsx("References/HutchFusion_neoSplice_CBFB.xlsx", sheetIndex = 1) %>% 
  select(Exons,Isoform,chrA,posA,chrB,posB,Mut20) %>%
  distinct() %>%
  rowwise() %>% 
  mutate(Breakpoint=paste(paste(chrA,posA, sep=":"), 
                           paste(chrB,posB, sep = ":"),
                           sep="|")) %>% 
  ungroup() %>% 
  group_by(Exons,Mut20) %>%
  mutate(Breakpoint=paste(Breakpoint, collapse = "; ")) %>% 
  arrange(Exons) %>% 
  select(Exons,Isoform,Mut20, Breakpoint) %>% 
  distinct()

# exon_to_genomicLocus
# unique(exon_to_genomicLocus$Mut20) #5
cbfb_all <- cbfb_mut20_RBD %>% 
  bind_rows(., cbfb_mut20_polyA) %>% 
  # filter(!USI %in% ineligables) %>%

  group_by(Sample,Lib_Prep) %>%
  mutate(CBFB_MYH11_True_Positive=case_when(
   (grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads > 0) ~ "TP",
   !(grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads > 0) ~ "FP", 
   (grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads == 0) ~ "FN", 
   !(grepl("CBFB-MYH11", Primary.Fusion) | grepl("CBFB-MYH11", Additional.Fusions.CNV)) & any(Num_Junction_Breakpoint_Reads == 0) ~ "TN")) %>% 

  ungroup() %>%
  left_join(., exon_to_genomicLocus, by="Exons") %>% 
  dplyr::select(Sample:Additional.Fusions.CNV,CBFB_MYH11_True_Positive, everything())


# dim(cbfb_all) #10625    34
# length(unique(cbfb_all$Sample)) #1967 (1961 without ineligables)
head(cbfb_all)

Check with the original

cbfb_res_orig <- xlsx::read.xlsx("CBFB.MYH11_Results/Theoretical_data/TARGET_0531_1031.CBFB-MYH11.FusionBreakpoint.Unmasked.xlsx", sheetIndex = 1)

# head(cbfb_res_orig)
dim(cbfb_res_orig)#152   4
length(unique(cbfb_res_orig$USI))
check <- cbfb_res_orig  %>% 
  inner_join(., cbfb_all,
             by="USI") %>%
  select(Sample:Exons,CBFB_MYH11_True_Positive,
         Fusion_Breakpoint_Unmasked, Fusion_Breakpoint, everything()) %>%

  group_by(Sample, Lib_Prep) %>%
  mutate(Keep=case_when(
    Num_Junction_Breakpoint_Reads > 0 ~ TRUE, 
    Num_Junction_Breakpoint_Reads == 0 & CBFB_MYH11_True_Positive == "FN" ~ !duplicated(Sample),
    TRUE ~ FALSE)) %>%
  ungroup() %>%
  # filter(Num_Junction_Breakpoint_Reads > 0) %>%

  # mutate(Different=ifelse(Exons==Fusion_Breakpoint_Unmasked, "No", "Yes")) %>%
  # arrange(desc(Fusion_Breakpoint), Fusion_Breakpoint_Unmasked, USI) %>%
  select(Keep, Sample:Additional.Fusions.CNV,Lib_Prep,
         # Different,
         everything())

check

# table(check$Different, useNA='ifany')
# table(duplicated(check$USI))
# write.csv(check,"CBFB_MYH11_Check.csv", row.names = FALSE)
toFill <- cbfb_all %>% 
  filter(CBFB_MYH11_True_Positive=="TP", Num_Junction_Breakpoint_Reads >0) %>% 
  filter(! USI %in% cbfb_res_orig$USI) %>% 
  select(Sample,USI, Exons,Isoform,Mut20,Breakpoint)

# toFill
# dim(toFill)
# write.csv(toFill,"CBFB.MYH11_Results/Theoretical_data/TARGET_AML_CBFB-MYH11_RNAseq_mut20_bySeq_Search.csv", row.names = FALSE)

Specificty and Sensitivity

cbfb_all %>% 
  select(Sample,CBFB_MYH11_True_Positive) %>% 
  unique() %>% 
  group_by(CBFB_MYH11_True_Positive) %>% 
  summarise(N=n()) %>% 
  ungroup()
FN_all <- filter(cbfb_all, CBFB_MYH11_True_Positive == "FN") %>% 
  arrange(Protocol) %>% 
  dplyr::select(Sample,USI,Protocol, Lib_Prep,
                Primary.Fusion, Additional.Fusions.CNV, CBFB_MYH11_True_Positive)  %>% 
  unique()

FN_all #13 False negatives (not so great)

# table(FN_all$USI %in% ineligables)
# table(FN_all$USI %in% cbfb_res_orig$USI)

TARGET.20.PARAHF.09A.01R #Low depth TARGET.20.PASGKA.09A.01R # Low depth TARGET.20.PASVVM.09A.01R #low depth

TARGET.20.PAURKG.09A.01R #RBD TARGET.20.PAWHTL.03A.01R #RBD TARGET.20.PAWNIK.09A.01R #RBD

ST.Jude Data

#Need STAR fusion data in order to see how many Inv.16 were detected. 
star.fmt.primary <- read.csv(file.path("References/St.Jude_AML_ALL_STAR_Fusion_reformatted_FilteredForNBM_PrimaryFusions_10.12.20.csv")) %>% 
  left_join(., SJ_RNAseq_files, by="Sample")

dim(star.fmt.primary)
head(star.fmt.primary)
length(unique(star.fmt.primary$Sample))

table(SJ_RNAseq_files$sj_diseases)

dim(SJ_RNAseq_files)
CBF.JS <- star.fmt.primary %>% 
  filter(grepl("CBFB-MYH11", Fusion.Category) | sj_diseases == "CBF") %>% 
  select(Sample,X.Fusion,sj_diseases) %>% 
  arrange(X.Fusion)


CBF.JS %>% 
  filter(grepl("CBFB-MYH11", X.Fusion))
SJ_results <- grep("_SJ_", results_files, value = T)
# SJresults
table(names(SJ_results))
cat_results <- lapply(seq(2,length(SJ_results), by=2), 
                      function(i){

                #select the results files  (in two parts)
                rds0 <- SJ_results[i-1]
                rds1 <- SJ_results[i]
                # print(c(rds0,rds1))

                #define output filename and destination
                fname <- str_split_fixed(rds0,"\\/", n=9)[,8] %>% 
                  gsub("_rslurm_", "", .) %>% 
                  paste0("St.Jude_AML_ALL_",., "_breakpoint_evidence.csv")

                fusion <- unique(c(names(rds0),names(rds1)))
                print(fusion)
                destination <- paste0(getwd(), "/", fusion, "_Results")
                # suppressWarnings(dir.create(destination,recursive = T))

                #read in the results
                res <- purrr::map_dfr(c(rds0, rds1), function(x) bind_rows(readRDS(x)))

                # RNAseq batch annotations
                res <- res %>%
                      left_join(., select(SJ_RNAseq_files, -filename, -filepath),
                              by="Sample")


                print(paste(destination, fname, sep="/"))
                write.csv(res,paste(destination, fname, sep="/"), row.names = FALSE)

})
SJ.CBFB <- read.csv("CBFB.MYH11_Results/St.Jude_AML_ALL_CBFB_MYH11_SJ_RNAseq_mut20_breakpoint_evidence.csv")

head(SJ.CBFB)
filter(SJ.CBFB, Num_Junction_Breakpoint_Reads > 0 ) %>% 
  filter(!duplicated(Sample))

Session Information

sessionInfo()


jennylsmith/fusBreakpoint documentation built on Oct. 7, 2021, 8:04 p.m.