knitr::opts_knit$set( root.dir = file.path(PROJHOME,"2020.09.15_RNAseq_Fusion_Breakpoints")) library(fusBreakpoint) library(magrittr) library(dplyr) library(tibble) library(stringr)
knitr::opts_chunk$set( collapse = TRUE, comment = "#>", tidy.opts=list(width.cutoff=50), tidy=TRUE, fig.align='center', fig.width = 10, fig.height = 10 )
#Function to wrap up all the RSlurm submissions for the different fusion groups and fusion sequences. submit_slurm_per_fusion <- function(fusion.category, file_manifest, chromLengths, brkpointcol="breakpoint", cohort=""){ # The fusion group to investigate group <- fusion.category breakpoint <- common_breakpoints[group,brkpointcol] print(group) print(breakpoint) #Filter the sequence data for the fusion to investigate fusion_theoretical_input <- filter(theoretical_data.collapsed, Fusion.Category == group) cicero.contig_input <- filter(cicero.contig, Fusion.Category == group) TA.contigs_input <- filter(TA.contigs, Fusion.Category == group) # I forgot seqs need to be the same width for pdict objects. hopefully will change soon. try( mut20 <- create_custom_pdict(theoretical_seqs_df = filter(fusion_theoretical_input, grepl("Mutseq20", Name))), silent=T) try( mut25 <- create_custom_pdict(theoretical_seqs_df = filter(fusion_theoretical_input, !grepl("Mutseq20", Name))), silent = T) for (n in 1:2){ mutseq <- c("mut20","mut25")[n] try(pdict_obj <- get(mutseq), silent=T) if(exists("pdict_obj")){ #some groups do not have both types of data. samps <- as.list(pull(file_manifest, Sample)) jobname <- paste(gsub("-","_",group), cohort,mutseq, sep="_") message(paste0("submitting: ",jobname)) sjob <- rslurm::slurm_map(samps, examine_breakpoint_evidence, breakpoint = breakpoint, file_manifest = file_manifest, Seqlens = chromLengths, fusion_theoretical_pdict = pdict_obj, cicero.contig = cicero.contig_input, TA.contigs = TA.contigs_input, slurm_options=sopt, jobname=jobname, submit = T) rm(pdict_obj) } rm(mutseq) } }
merged <- read.csv(file.path(CDE,"Merged/TARGET_AML_0531_1031_merged_CDEs_9.18.20.csv")) merged <- merged %>% filter(USI != "Unknown") %>% as.data.frame() %>% set_rownames(.$USI) dim(merged)
IDmap <- read.csv("References/St.Jude_TARGET_CICERO_Sample_IDmap.csv") head(IDmap)
polyA_RNAseq_files <- read.csv("BAM_Manifests/TARGET_AML_polyA_RNAseq_Bam_Manifest_10.02.20.csv", row.names = 1) dim(polyA_RNAseq_files) table(duplicated(polyA_RNAseq_files$Sample)) #FALSE
RBD_RNAseq_files <- read.csv("BAM_Manifests/TARGET_AML_Ribodepleted_RNAseq_Bam_Manifest_10.02.20.csv",row.names = 1) dim(RBD_RNAseq_files) table(duplicated(RBD_RNAseq_files$Sample)) #FALSE
SJ_RNAseq_files <- read.csv("BAM_Manifests/St.Jude_AML_ALL_RNAseq_Bam_Manifest_10.09.20.csv") dim(SJ_RNAseq_files) head(SJ_RNAseq_files)
TA.contigs <- read.csv("Data/TransAbyss_Contig_Sequences.csv") # head(TA.contigs) dim(TA.contigs) #54,878 4
cicero.contig <- read.csv("Data/CICERO_Contig_Sequences.csv") # head(cicero.contig) dim(cicero.contig) #158,972
theoretical_data.collapsed <- read.csv("Data/Theoretical_Fusion_Breakpoint_Junctions_per_Transcript.csv") head(theoretical_data.collapsed) dim(theoretical_data.collapsed) # 1955 4
seqlens <- read.delim("Data/Grch37.lite_chrom_lengths.txt") %>% rownames_to_column("chr") %>% pull(x, name=chr) head(seqlens)
seqlens.ctat <- read.delim("Data/Grch37.ctat_chrom_lengths.txt") %>% rownames_to_column("chr") %>% pull(x, name=chr) head(seqlens.ctat)
Groups <- unique(theoretical_data.collapsed$Fusion.Category) Groups
I just need a breakpoint, but since I'll be searching for evidence in the RNAseq reads for the whole chromosome, don't need to exact breakpoint junction per patient. Just need a common one, such as
common_breakpoints <- read.csv("Data/TARGET_AML_Most_Common_RNAseq_Breakpoints.csv", row.names = 1) common_breakpoints
outdir <- file.path(SCRATCH,'jlsmith3/Fusion_Breakpoints') sopt <- list(nodes='1', 'cpus-per-task'='2', 'partition'='campus-new', 'mem'='40G', 'time' = '48:00:00', 'mail-type'='FAIL,END', 'mail-user'='jlsmith3@fredhutch.org')
setwd(outdir) jobs1 <- lapply(Groups, submit_slurm_per_fusion, file_manifest=polyA_RNAseq_files, chromLengths=seqlens, cohort="polyA_RNAseq")
setwd(outdir) RBD_RNAseq_files <- RBD_RNAseq_files %>% filter(grepl("AML|NBM|CD34_PB", Group), grepl("diagnostic|NBM|CD34_NBM", Time_point)) jobs2 <- lapply(Groups, submit_slurm_per_fusion, file_manifest=RBD_RNAseq_files, chromLengths=seqlens, cohort="RBD_RNAseq")
setwd(outdir) jobs3 <- lapply(Groups, submit_slurm_per_fusion, file_manifest=SJ_RNAseq_files, chromLengths=seqlens.ctat, brkpointcol="breakpoint_ctat", cohort="SJ_RNAseq")
library(furrr) future::plan("multisession") # future::plan("sequential") plan() availableCores()
setwd(outdir) for ( i in 1:length(Groups) ){ # The fusion group to investigate group <- Groups[i] breakpoint <- common_breakpoints[group,"breakpoint"] print(group) print(breakpoint) #Filter the sequence data for the fusion to investigate fusion_theoretical_input <- filter(theoretical_data.collapsed, Fusion.Category == group) cicero.contig_input <- filter(cicero.contig, Fusion.Category == group) TA.contigs_input <- filter(TA.contigs, Fusion.Category == group) # I forgot seqs need to be the same width for pdict objects. hopefully will change soon. try( mut20 <- create_custom_pdict(theoretical_seqs_df = filter(fusion_theoretical_input, grepl("Mutseq20", Name))), silent=T) try( mut25 <- create_custom_pdict(theoretical_seqs_df = filter(fusion_theoretical_input, !grepl("Mutseq20", Name))), silent = T) for (n in 1:2){ mutseq <- c("mut20","mut25")[n] try(pdict_obj <- get(mutseq), silent=T) fname <- paste0("Rdata/TARGET_AML_",group,"_polyA_RNAseq_",mutseq,".RDS") if(exists("pdict_obj")){ #some groups do not have both types of data. samps <- as.list(pull(polyA_RNAseq_files, Sample)) tictoc::tic() polyA_evidence <- furrr::future_map_dfr(samps, examine_breakpoint_evidence, breakpoint = breakpoint, file_manifest = polyA_RNAseq_files, Seqlens = seqlens, fusion_theoretical_pdict = pdict_obj, cicero.contig = cicero.contig_input, TA.contigs = TA.contigs_input, .progress=TRUE) tictoc::toc() #Save output to RDS data saveRDS(polyA_evidence, fname) #remove large objects from memory rm(pdict_obj) rm(polyA_evidence) gc() } } }
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.