R/0_exec_s08_vaf_scatterplot_cfdna_wbc.R

#==================================================
# David Brown
# brownd7@mskcc.org
#==================================================
rm(list=ls(all=TRUE))
source('config.R')

if (!dir.exists("../res/figureS8")) {
	dir.create("../res/figureS8")
}

if (!dir.exists("../res/etc/Source_Data_Extended_Data_Fig_6")) {
	dir.create("../res/etc/Source_Data_Extended_Data_Fig_6")
}

#==================================================
# barplot of mutation burden and sources of mutation
# per patient and cohort including hypermutators
#==================================================
clinical = read_tsv(file=clinical_file, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
		   
snv_vars = read_tsv(snv_file$scored, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(level_2a = as.character(level_2a)) %>%
		   mutate(level_r1 = as.character(level_r1))
		   
indel_vars = read_tsv(indel_file$scored, col_types = cols(.default = col_character())) %>%
			 type_convert() %>%
			 mutate(level_2a = as.character(level_2a)) %>%
		     mutate(level_r1 = as.character(level_r1))

wbc_stack = read_tsv(wbc_variants$scored, col_types = cols(.default = col_character())) %>%
			type_convert()
			
msk_anno = read_tsv(msk_anno_joined, col_types = cols(.default = col_character())) %>%
  		   type_convert()
			
tracker_grail = read_csv(file=patient_tracker)

tracker_impact = read_csv(file=impact_tracker)

bed_file = rtracklayer::import.bed(con=common_bed)
bed_ranges = GenomicRanges::ranges(bed_file)
total_bed_Mb = round(sum(GenomicRanges::width(bed_ranges)) / 1e6, 1)

valid_patient_ids = tracker_grail %>%
					filter(patient_id %in% tracker_impact$patient_id) %>%
					.[["patient_id"]]
  
indel_vars = indel_vars %>%
			 mutate(filter = replace(filter,
             		patient_id == "MSK-VB-0001" &
             		gene == "GATA3" &
             		filter == "PASS",
             		"CSR_MATCH_ELIMINATED"),
         	 		ccd = replace(ccd,
             			   		  patient_id == "MSK-VB-0001" &
                           		  gene == "GATA3" &
                           		  filter == "CSR_MATCH_ELIMINATED",
                           		  0))

snv_plasma = snv_vars %>%
  			 filter(ccd == 1,
         			(c_panel == 1 | panel == 1),
         			study == "TechVal",
         			grail == 1,
         			patient_id %in% valid_patient_ids) %>%
			 mutate(vtype = "SNV")

indel_plasma = indel_vars %>%
			   filter(ccd == 1,
         			  (c_panel == 1 | panel == 1),
         			  study == "TechVal",
         			  grail == 1,
         			  patient_id %in% valid_patient_ids) %>%
  			   mutate(vtype = "INDEL",
         			  altenddistmedian = as.integer(altenddistmedian))
         			  
healthy_snv = snv_vars %>%
  			  filter((c_panel == 1 | panel == 1),
         			  subj_type == "Healthy",
         			  grail == 1) %>%
  			  mutate(vtype = "SNV")

healthy_indel = indel_vars %>%
  				filter((c_panel == 1 | panel == 1),
         			    subj_type == "Healthy",
         				grail == 1) %>%
  				mutate(vtype = "INDEL",
         			   altenddistmedian = as.integer(altenddistmedian))

small_vars_plasma = full_join(snv_plasma, indel_plasma) %>%
					full_join(healthy_snv) %>%
					full_join(healthy_indel)
small_vars_plasma = small_vars_plasma %>%
  					mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
  					
small_vars_plasma = small_vars_plasma %>%
					mutate(loc = str_c(chrom, ":", position_orig, "_", ref_orig, ">", alt_orig))  					

variants = label_bio_source(small_vars_plasma)

variants = left_join(variants, msk_anno %>% dplyr::select(patient_id, chrom, position, ref, alt, CASE:complex_indel_duplicate))
variants = variants %>%
		   mutate(bio_source = case_when(
		   					   MSK == 1 & grail == 1 ~ "biopsy_matched",
		   					   MSK == 1 & grail == 0 ~ "biopsy_only",
		   					   category %in% c("artifact", "edge", "low_depth", "low_qual") ~ "noise",
		   					   category %in% c("germline", "germlineish") ~ "germline",
		   					   category %in% c("blood", "bloodier") ~ "WBC_matched",
		   					   category == "somatic" & `IM-T.alt_count` > bam_read_count_cutoff ~ "IMPACT-BAM_matched",
		   					   category == "somatic" ~ "VUSo",
		   					   TRUE ~ "other"),
		   		  af = ifelse(is.na(af), 0, af),
		   		  af_nobaq = round(adnobaq / dpnobaq * 100, 2),
		   		  af_nobaq = ifelse(is.na(af_nobaq), 0, af_nobaq))

per_pat_smry = variants %>%
			   filter(!(bio_source %in% c("noise","germline","other")), is_nonsyn) %>%
			   group_by(subj_type, patient_id, bio_source) %>%
			   dplyr::summarize(num = n()) %>%
			   ungroup
  
per_pat_smry_all = variants %>%
				   filter(!(bio_source %in% c("noise","germline","other")), is_nonsyn) %>%
				   group_by(subj_type, patient_id) %>%
				   dplyr::summarize(all = n()) %>%
				   ungroup

per_pat_smry = left_join(per_pat_smry, per_pat_smry_all)

all_patient_table = small_vars_plasma %>%
					distinct(subj_type, patient_id)
all_patient_table = cbind.data.frame(subj_type = rep(all_patient_table$subj_type, 4),
                                     patient_id = rep(all_patient_table$patient_id, 4),
                                     bio_source = rep(c("WBC_matched",
                                                        "VUSo",
                                                        "biopsy_matched",
                                                        "IMPACT-BAM_matched"),
                                                      each = nrow(all_patient_table)))
per_pat_smry = left_join(all_patient_table, per_pat_smry)
per_pat_smry[is.na(per_pat_smry)] = 0
per_pat_smry = per_pat_smry %>%
			   mutate(orig_num = num, num = round(num / total_bed_Mb, 0))
cohort_stats = per_pat_smry %>%
			   group_by(subj_type, bio_source) %>%
			   dplyr::summarize(cohort_size = n(), positive_scored = sum(num > 0), percent_of_samples = 100 * sum(num > 0) / n()) %>%
			   ungroup()

export_x = NULL
pdf(file="../res/figureS8/barplot_bio_sources_hyper.pdf", width=12, height=9)
par(mar = c(6.1, 6, 4.1, 1))
zz = split.screen(figs=matrix(c(0.05,.55,.43,.93, .46,.96,.43,.93, 0.05,.55,0.09,.59, .46,.96,0.09,.59, 0,1,0,1), nrow=5, ncol=4, byrow=TRUE))
subj = c("Control", "Breast", "Lung", "Prostate")
for (i in 1:length(subj)) {
	screen(zz[i])
  	subj_smry = per_pat_smry %>%
  				filter(subj_type == subj[i])
  
  	wbc_pats = subj_smry %>%
  			   filter(bio_source == "WBC_matched") %>%
  			   dplyr::select(patient_id, num, all)
  
  	ordered_patient_ids = wbc_pats %>%
  						  arrange(num, all) %>%
  						  .[["patient_id"]]
  
  	subj_smry = subj_smry %>%
  			    mutate(num = ifelse(bio_source == "WBC_matched", -num, num),
  			  		   patient_id = factor(patient_id, levels = ordered_patient_ids),
  			  		   bio_source = factor(bio_source, levels = c("WBC_matched", "biopsy_matched", "VUSo", "IMPACT-BAM_matched")))
  
  	wbc_matched = subj_smry %>%
  				  filter(bio_source=="WBC_matched")
  	vuso = subj_smry %>%
  		   filter(bio_source=="VUSo")
  	biopsy_matched = subj_smry %>%
  		   		     filter(bio_source=="biopsy_matched")
  	biopsy_subthreshold = subj_smry %>%
  		   		     	  filter(bio_source=="IMPACT-BAM_matched")
  	
  	index = order(biopsy_matched[,"num"]+vuso[,"num"])
  	wbc_matched = wbc_matched[index,,drop=FALSE]
  	vuso = vuso[index,,drop=FALSE]
  	biopsy_matched = biopsy_matched[index,,drop=FALSE]
  	biopsy_subthreshold = biopsy_subthreshold[index,,drop=FALSE]
  	
  	index = order(wbc_matched[,"num"], decreasing=TRUE)
  	wbc_matched = wbc_matched[index,,drop=FALSE]
  	vuso = vuso[index,,drop=FALSE]
  	biopsy_matched = biopsy_matched[index,,drop=FALSE]
  	biopsy_subthreshold = biopsy_subthreshold[index,,drop=FALSE]
  	
  	export_x = rbind(export_x,
  					 wbc_matched,
  					 vuso,
  					 biopsy_matched,
  					 biopsy_subthreshold)
  	
  	zzz = barplot(wbc_matched[,"num"]-.9, col=variant_cols["WBC_matched"], border="black", space=.16, axes=FALSE, ylim=c(-30,150), lwd=.01)
  	abline(h=0, col="white", lwd=4)
  	barplot(ifelse((vuso[,"num"]+biopsy_matched[,"num"]+biopsy_subthreshold[,"num"])<110, (vuso[,"num"]+biopsy_matched[,"num"]+biopsy_subthreshold[,"num"]), 120), col=variant_cols["VUSo"], border="black", space=.16, add=TRUE, axes=FALSE, lwd=.01)
  	barplot(biopsy_matched[,"num"]+biopsy_subthreshold[,"num"], col=variant_cols["IMPACT-BAM_matched"], border="black", space=.16, add=TRUE, axes=FALSE, lwd=.01)
  	barplot(biopsy_matched[,"num"], col=variant_cols["biopsy_matched"], border="black", space=.16, add=TRUE, axes=FALSE, lwd=.01)
  	if (i==1 | i==3) {
  		axis(2, at = c(-30,-15,0,25,50,75,100,115), labels=c(30,15,0,25,50,75,100,110), cex.axis = 1, las = 1)
  		axis(2, at = c(130, 150), labels=c(550,600), cex.axis = 1, las = 1)
  	} else {
		axis(2, at = c(-30,-15,0,25,50,75,100,115), labels=rep("",8), cex.axis = 1, las = 1)
		axis(2, at = c(130, 150), labels=c("",""), cex.axis = 1.4, las = 1)
	}
	axis(2, at = c(100, 150), labels=c("",""), cex.axis = 1, las = 1, lwd.ticks=-1)
  	axis.break(axis=2, breakpos=123, pos=NA,bgcol="white",breakcol="black",style="slash",brw=0.02)
	if (i==2) {
		rect(xleft=zzz[36]-.565, xright=zzz[36]+.565, ybottom=130, ytop=130+(587-550)-20, col=variant_cols["VUSo"], border="black")
	}
	title(main=subj[i])
}
screen(zz[5])
plot(0,0, type="n", xlim=c(0,10), ylim=c(-1,1), xlab="", ylab="", axes=FALSE, frame.plot=FALSE)
mtext(side = 2, text = expression("Somatic cfDNA variants / Mb"), line = 1.5, cex = 1.4)
close.screen(all.screens=TRUE)
dev.off()

export_x = export_x %>%
		   dplyr::select(`patient_id` = `patient_id`,
		   				 `tissue` = `subj_type`,
		   				 `bio_source` = `bio_source`,
		   				 `n` = `num`) %>%
		   mutate(n = abs(n)) %>%
		   mutate(bio_source = case_when(
		   		  bio_source == "WBC_matched" ~ "wbc_matched",
		   		  bio_source == "VUSo" ~ "vuso",
		   		  bio_source == "biopsy_matched" ~ "biopsy_matched",
		   		  bio_source == "IMPACT-BAM_matched" ~ "biopsy_subthreshold"))
write_tsv(export_x, path="../res/etc/Source_Data_Extended_Data_Fig_6/Extended_Data_Fig_6a.tsv", append=FALSE, col_names=TRUE)

#==================================================
# heatmap of wbc-matched mutations with hypermutators
#==================================================
clinical = read_tsv(file=clinical_file, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
		   
snv_vars = read_tsv(snv_file$scored, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(level_2a = as.character(level_2a)) %>%
		   mutate(level_r1 = as.character(level_r1))
		   
indel_vars = read_tsv(indel_file$scored, col_types = cols(.default = col_character())) %>%
			 type_convert() %>%
			 mutate(level_2a = as.character(level_2a)) %>%
		     mutate(level_r1 = as.character(level_r1))

wbc_stack = read_tsv(wbc_variants$scored, col_types = cols(.default = col_character())) %>%
			type_convert()
			
msk_anno = read_tsv(msk_anno_joined, col_types = cols(.default = col_character())) %>%
  		   type_convert()
			
tracker_grail = read_csv(file=patient_tracker)

tracker_impact = read_csv(file=impact_tracker)

bed_file = rtracklayer::import.bed(con=common_bed)
bed_ranges = GenomicRanges::ranges(bed_file)
total_bed_Mb = round(sum(GenomicRanges::width(bed_ranges)) / 1e6, 1)

valid_patient_ids = tracker_grail %>%
					filter(patient_id %in% tracker_impact$patient_id) %>%
					.[["patient_id"]]
  
indel_vars = indel_vars %>%
			 mutate(filter = replace(filter,
             		patient_id == "MSK-VB-0001" &
             		gene == "GATA3" &
             		filter == "PASS",
             		"CSR_MATCH_ELIMINATED"),
         	 		ccd = replace(ccd,
             			   		  patient_id == "MSK-VB-0001" &
                           		  gene == "GATA3" &
                           		  filter == "CSR_MATCH_ELIMINATED",
                           		  0))

snv_plasma = snv_vars %>%
  			 filter(ccd == 1,
         			(c_panel == 1 | panel == 1),
         			study == "TechVal",
         			grail == 1,
         			patient_id %in% valid_patient_ids) %>%
			 mutate(vtype = "SNV")

indel_plasma = indel_vars %>%
			   filter(ccd == 1,
         			  (c_panel == 1 | panel == 1),
         			  study == "TechVal",
         			  grail == 1,
         			  patient_id %in% valid_patient_ids) %>%
  			   mutate(vtype = "INDEL",
         			  altenddistmedian = as.integer(altenddistmedian))
         			  
healthy_snv = snv_vars %>%
  			  filter((c_panel == 1 | panel == 1),
         			  subj_type == "Healthy",
         			  grail == 1) %>%
  			  mutate(vtype = "SNV")

healthy_indel = indel_vars %>%
  				filter((c_panel == 1 | panel == 1),
         			    subj_type == "Healthy",
         				grail == 1) %>%
  				mutate(vtype = "INDEL",
         			   altenddistmedian = as.integer(altenddistmedian))

small_vars_plasma = full_join(snv_plasma, indel_plasma) %>%
					full_join(healthy_snv) %>%
					full_join(healthy_indel)
small_vars_plasma = small_vars_plasma %>%
  					mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
  					
small_vars_plasma = small_vars_plasma %>%
					mutate(loc = str_c(chrom, ":", position_orig, "_", ref_orig, ">", alt_orig))  					

variants = label_bio_source(small_vars_plasma)

variants = left_join(variants, msk_anno %>% dplyr::select(patient_id, chrom, position, ref, alt, CASE:complex_indel_duplicate))
variants = variants %>%
		   mutate(bio_source = case_when(
		   					   MSK == 1 & grail == 1 ~ "biopsy_matched",
		   					   MSK == 1 & grail == 0 ~ "biopsy_only",
		   					   category %in% c("artifact", "edge", "low_depth", "low_qual") ~ "noise",
		   					   category %in% c("germline", "germlineish") ~ "germline",
		   					   category %in% c("blood", "bloodier") ~ "WBC_matched",
		   					   category == "somatic" ~ "VUSo",
		   					   TRUE ~ "other"),
		   		  af = ifelse(is.na(af), 0, af),
		   		  af_nobaq = round(adnobaq / dpnobaq * 100, 2),
		   		  af_nobaq = ifelse(is.na(af_nobaq), 0, af_nobaq))
		   		  
variants = variants %>%
 		   filter(bio_source=="WBC_matched", is_nonsyn)
unique_genes = na.omit(unique(variants$gene))
subj_types = c("Control", "Breast", "Lung", "Prostate")
n = matrix(NA, nrow=length(unique_genes), ncol=4, dimnames=list(unique_genes, subj_types))
for (i in 1:length(unique_genes)) {
	for (j in 1:length(subj_types)) {
		index = which(variants$gene==unique_genes[i] & variants$subj_type==subj_types[j])
		n[i,j] = length(index)
	}
}
index = order(apply(n, 1, sum), decreasing=TRUE)
n = n[index,,drop=FALSE]
pdf(file="../res/figureS8/heatmap_wbc_matched_by_variants.pdf", width=5, height=21)
corrplot(corr=n[1:40,,drop=FALSE], method="color", type="full", add=FALSE,
		 col=colorRampPalette(c(rep("#ffffff",2), "#19547b"))(100), bg=NULL, is.corr=FALSE,
		 addgrid.col="white", diag=TRUE, outline=FALSE, mar=c(1, 0, 1, 0), cl.pos="n",
 		 addCoef.col = "black", number.font = 1, number.cex = 1.15, tl.cex = 1.5, tl.col = "black", tl.offset = 0.5)
dev.off()

export_x = as.data.frame(n) %>%
		   mutate(gene_id = rownames(n)) %>%
		   dplyr::select(gene_id, `control` = Control, `breast` = Breast, `lung` = Lung, `prostate` = Prostate)
export_x = export_x[1:40,,drop=FALSE]
write_tsv(export_x, path="../res/etc/Source_Data_Extended_Data_Fig_6/Extended_Data_Fig_6b.tsv", append=FALSE, col_names=TRUE)

subj_num_smry = variants %>%
				group_by(subj_type) %>%
				dplyr::summarize(subj_num = length(unique(patient_id))) %>%
				ungroup() %>%
				mutate(subj_type_num = paste0(subj_type, "(", subj_num, ")"))

gene_recurrences = variants %>%
 				   filter(!is.na(gene)) %>%
				   group_by(bio_source, subj_type, gene, is_nonsyn) %>%
 				   dplyr::summarize(number = n(), num_patient = length(unique(patient_id))) %>%
 				   ungroup %>%
 				   left_join(subj_num_smry) %>%
 				   mutate(percent_patient = round(num_patient / subj_num * 100, 0))
   
gene_list = c()
gene_stats_table_list = list()

for (subj in subj_num_smry$subj_type) {
   top_wbc_gene_ids = gene_recurrences %>%
  					  filter(subj_type == subj, bio_source == "WBC_matched", is_nonsyn) %>%
   					  group_by(gene) %>%
    				  dplyr::summarize(all = sum(percent_patient)) %>%
   					  ungroup() %>%
   					  arrange(-all) %>%
   					  dplyr::slice(1:20) %>%
   					  .[["gene"]]
   
   gene_list = c(gene_list, top_wbc_gene_ids)
}
 
for (subj in c("Control", "Breast", "Lung", "Prostate")) {
  subj_smry = gene_recurrences %>%
   			  filter(subj_type == subj, bio_source == "WBC_matched", is_nonsyn, gene %in% unique(gene_list)) %>%
  			  dplyr::select(gene, num_patient)
   
  gene_stats_table_list[[subj]] = subj_smry
}

gene_stats_smry = data.frame(gene = unique(gene_list))
gene_stats_smry = left_join(gene_stats_smry, gene_stats_table_list[["Control"]], by = "gene") %>%
				  left_join(gene_stats_table_list[["Breast"]], by = "gene") %>%
				  left_join(gene_stats_table_list[["Lung"]], by = "gene") %>%
				  left_join(gene_stats_table_list[["Prostate"]], by = "gene")
colnames(gene_stats_smry) = c("gene", "Control", "Breast", "Lung", "Prostate")
gene_stats_smry[is.na(gene_stats_smry)] = 0
gene_stats_mean = gene_stats_smry %>%
				  dplyr::select(-gene) %>%
 				  rowMeans()
gene_stats_smry = bind_cols(gene_stats_smry, Mean = gene_stats_mean) %>%
				  arrange(-Mean)

gene_stats_matrix = as.matrix(gene_stats_smry %>%
					dplyr::select(-gene, -Mean))
rownames(gene_stats_matrix) = gene_stats_smry$gene
idx_names = match(colnames(gene_stats_matrix), subj_num_smry$subj_type)
colnames(gene_stats_matrix) = subj_num_smry$subj_type_num[idx_names]

max_num = max(gene_stats_matrix) + 3
my_palette = colorRampPalette(c("white","#1F6784"))(n = 29)
col_breaks = c(seq(0, 5, length = 10), seq(5.1, max_num, length = 20))
colnames(gene_stats_matrix) = unlist(lapply(strsplit(subj_num_smry$subj_type_num[idx_names], "(", fixed=TRUE), function(x) {x[1]}))

pdf(file="../res/figureS8/heatmap_wbc_matched_by_gene.pdf", width=5, height=21)
corrplot(corr=gene_stats_matrix, method="color", type="full", add=FALSE,
		 col=colorRampPalette(c(rep("#ffffff",2), "#19547b"))(100), bg=NULL, is.corr=FALSE,
		 addgrid.col="white", diag=TRUE, outline=FALSE, mar=c(1, 0, 1, 0), cl.pos="n",
 		 addCoef.col = "black", number.font = 1, number.cex = 1.15, tl.cex = 1.5, tl.col = "black", tl.offset = 0.5)
dev.off()

#==================================================
# scatter plot of vaf in wbc and cfdna
#==================================================
clinical = read_tsv(file=clinical_file, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
		   
snv_vars = read_tsv(snv_file$scored, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(level_2a = as.character(level_2a)) %>%
		   mutate(level_r1 = as.character(level_r1))
		   
indel_vars = read_tsv(indel_file$scored, col_types = cols(.default = col_character())) %>%
			 type_convert() %>%
			 mutate(level_2a = as.character(level_2a)) %>%
		     mutate(level_r1 = as.character(level_r1))

wbc_stack = read_tsv(wbc_variants$scored, col_types = cols(.default = col_character())) %>%
			type_convert()
			
msk_anno = read_tsv(msk_anno_joined, col_types = cols(.default = col_character())) %>%
  		   type_convert()
			
tracker_grail = read_csv(file=patient_tracker)

tracker_impact = read_csv(file=impact_tracker)

valid_patient_ids = tracker_grail %>%
					filter(patient_id %in% tracker_impact$patient_id) %>%
					.[["patient_id"]]
  
indel_vars = indel_vars %>%
			 mutate(filter = replace(filter,
             		patient_id == "MSK-VB-0001" &
             		gene == "GATA3" &
             		filter == "PASS",
             		"CSR_MATCH_ELIMINATED"),
         	 		ccd = replace(ccd,
             			   		  patient_id == "MSK-VB-0001" &
                           		  gene == "GATA3" &
                           		  filter == "CSR_MATCH_ELIMINATED",
                           		  0))

snv_plasma = snv_vars %>%
  			 filter(ccd == 1,
         			(c_panel == 1 | panel == 1),
         			study == "TechVal",
         			grail == 1 | MSK == 1,
         			patient_id %in% valid_patient_ids) %>%
			 mutate(vtype = "SNV")

indel_plasma = indel_vars %>%
			   filter(ccd == 1,
         			  (c_panel == 1 | panel == 1),
         			  study == "TechVal",
         			  grail == 1 | MSK == 1,
         			  patient_id %in% valid_patient_ids) %>%
  			   mutate(vtype = "INDEL",
         			  altenddistmedian = as.integer(altenddistmedian))
         			  
healthy_snv = snv_vars %>%
  			  filter((c_panel == 1 | panel == 1),
         			  subj_type == "Healthy",
         			  grail == 1) %>%
  			  mutate(vtype = "SNV")

healthy_indel = indel_vars %>%
  				filter((c_panel == 1 | panel == 1),
         			    subj_type == "Healthy",
         				grail == 1) %>%
  				mutate(vtype = "INDEL",
         			   altenddistmedian = as.integer(altenddistmedian))

small_vars_plasma = full_join(snv_plasma, indel_plasma) %>%
					full_join(healthy_snv) %>%
					full_join(healthy_indel)
small_vars_plasma = small_vars_plasma %>%
  					mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))

small_vars_plasma = small_vars_plasma %>%
					mutate(loc = str_c(chrom, ":", position_orig, "_", ref_orig, ">", alt_orig))

all_patient_table = small_vars_plasma %>%
					distinct(subj_type, patient_id)
					
all_patient_table = cbind.data.frame(subj_type = rep(all_patient_table$subj_type, 4),
                                     patient_id = rep(all_patient_table$patient_id, 4),
                                     bio_source = rep(c("WBC_matched",
                                                        "VUSo",
                                                        "biopsy_matched",
                                                        "biopsy_only"),
                                                       each = nrow(all_patient_table)))

variants = label_bio_source(small_vars_plasma)

variants = left_join(variants, msk_anno %>% dplyr::select(patient_id, chrom, position, ref, alt, CASE:complex_indel_duplicate))
variants = variants %>%
		   mutate(bio_source = case_when(
		   					   MSK == 1 & grail == 1 ~ "biopsy_matched",
		   					   MSK == 1 & grail == 0 ~ "biopsy_only",
		   					   category %in% c("artifact", "edge", "low_depth", "low_qual") ~ "noise",
		   					   category %in% c("germline", "germlineish") ~ "germline",
		   					   category %in% c("blood", "bloodier") ~ "WBC_matched",
		   					   category == "somatic" & `IM-T.alt_count` > bam_read_count_cutoff ~ "IMPACT-BAM_matched",
		   					   category == "somatic" ~ "VUSo",
		   					   TRUE ~ "other"),
		   		  af = ifelse(is.na(af), 0, af),
		   		  af_nobaq = round(adnobaq / dpnobaq * 100, 2),
		   		  af_nobaq = ifelse(is.na(af_nobaq), 0, af_nobaq))


variants_nohyper = variants %>%
		   		   filter(!patient_id %in% c(hypermutators$patient_id, msi_hypermutators$patient_id))
cfdna_vs_gdna_vaf_plot = variants_nohyper %>%
						 filter(is_nonsyn) %>%
						 filter(!is.na(afmeancfdna)) %>%
						 filter(bio_source %in% c("WBC_matched", "biopsy_matched","VUSo", "IMPACT-BAM_matched")) %>%
						 mutate(afmeancfdna = afmeancfdna * 100) %>%
						 mutate(afmeangdna = afmeangdna * 100) %>%
						 mutate(afcfdna_nobaq = 100 * (adnobaq+2)/(dpnobaq+4)) %>%
						 mutate(afgdna_nobaq = 100 * (adgdna+2)/(dpgdna+4)) %>%
						 mutate(afcfdna_nobaq_nos = 100 * (adnobaq)/(dpnobaq)) %>%
						 mutate(afgdna_nobaq_nos = 100 * (adgdna)/(dpgdna)) %>%
						 mutate(afcfdna_nobaq_nos = ifelse(is.na(afcfdna_nobaq_nos) | afcfdna_nobaq_nos==0, 0.01, afcfdna_nobaq_nos)) %>%
						 mutate(afgdna_nobaq_nos = ifelse(is.na(afgdna_nobaq_nos) | afgdna_nobaq_nos==0, 0.01, afgdna_nobaq_nos)) %>%
						 mutate(facets_1 = "Mean posterior VAF") %>%
						 mutate(facets_2 = "VAF with pseudocounts without BAQ") %>%
						 mutate(facets_3 = "VAF without pseudocounts without BAQ") %>%
						 mutate(bio_source = case_when(
						 	bio_source == "biopsy_matched" ~ "Biopsy matched",
						 	bio_source == "IMPACT-BAM_matched" ~ "Biopsy subthreshold",
						 	bio_source == "VUSo" ~ "VUSo",
						 	bio_source == "WBC_matched" ~ "WBC matched",
						 ))

cols = variant_cols
names(cols) = c("VUSo", "Biopsy matched", "Biopsy subthreshold", "WBC matched", "Biopsy only")
         
pdf(file="../res/figureS8/VAF_VAF_posterior_mean.pdf", width=7, height=7)
par(mar = c(6.1, 6, 4.1, 1))
plot(1,1, type="n", xlim=c(0.01,15), ylim=c(0.01,15), xlab="", ylab="", axes=FALSE, frame.plot=FALSE, log="xy")
points(x=c(0.01, 15), y=c(0.01,15), type="l", lty=1, lwd=2, col="goldenrod3")
x = cfdna_vs_gdna_vaf_plot$afmeancfdna
y = cfdna_vs_gdna_vaf_plot$afmeangdna
x[x>15] = NA
y[y>15] = NA
for (i in c("VUSo", "WBC matched", "Biopsy matched", "Biopsy subthreshold")) {
	index = cfdna_vs_gdna_vaf_plot$bio_source==i
	points(x[index], y[index], type="p", col="black", bg=cols[i], pch=21, lwd=.9, cex=1.15)
}
axis(1, at = c(.01, .10, 1.0, 10.0, 15.0), labels = c(expression(10^-2), expression(10^-1), "1", "10", ""), cex.axis = 1.5, padj = 0.25, lwd=1.75, lwd.ticks=1.65)
axis(1, at = 15, labels = "15", cex.axis = 1.5, padj = 0.25, lwd=-1, las=1)
axis(2, at = c(.01, .10, 1.0, 10.0, 15.0), labels = c(expression(10^-2), expression(10^-1), "1", "10", ""), cex.axis = 1.5, las = 1, lwd=1.75, lwd.ticks=1.65)
axis(2, at = 15, labels = "15", cex.axis = 1.5, padj = 0.25, lwd=-1, las=1)
log10_axis(side=1, at=c(.01, .1, 1, 10), lwd=0, lwd.ticks=1)
log10_axis(side=2, at=c(.01, .1, 1, 10), lwd=0, lwd.ticks=1)
mtext(side = 1, text = "VAF in cfDNA (%)", line = 4, cex = 1.85)
mtext(side = 2, text = "VAF in WBC (%)", line = 4, cex = 1.85)
legend(x=0.009, y=19, pch=21, col="black", pt.bg=cols, pt.cex=1.55, pt.lwd=.5, legend=c("Biopsy matched", "Biopsy subthreshold", "VUSo", "WBC matched"), box.lwd=-1, cex=1.15)
dev.off()

pdf(file="../res/figureS8/VAF_VAF_pseudo_no_BAQ.pdf", width=7, height=7)
par(mar = c(6.1, 6, 4.1, 1))
plot(1,1, type="n", xlim=c(0.01,15), ylim=c(0.01,15), xlab="", ylab="", axes=FALSE, frame.plot=FALSE, log="xy")
points(x=c(0.01, 15), y=c(0.01,15), type="l", lty=1, lwd=2, col="goldenrod3")
x = cfdna_vs_gdna_vaf_plot$afcfdna_nobaq
y = cfdna_vs_gdna_vaf_plot$afgdna_nobaq
x[x>15] = NA
y[y>15] = NA
for (i in c("VUSo", "WBC matched", "Biopsy matched", "Biopsy subthreshold")) {
	index = cfdna_vs_gdna_vaf_plot$bio_source==i
	points(x[index], y[index], type="p", col="black", bg=cols[i], pch=21, lwd=.9, cex=1.15)
}
axis(1, at = c(.01, .10, 1.0, 10.0, 15.0), labels = c(expression(10^-2), expression(10^-1), "1", "10", ""), cex.axis = 1.5, padj = 0.25, lwd=1.75, lwd.ticks=1.65)
axis(1, at = 15, labels = "15", cex.axis = 1.5, padj = 0.25, lwd=-1, las=1)
axis(2, at = c(.01, .10, 1.0, 10.0, 15.0), labels = c(expression(10^-2), expression(10^-1), "1", "10", ""), cex.axis = 1.5, las = 1, lwd=1.75, lwd.ticks=1.65)
axis(2, at = 15, labels = "15", cex.axis = 1.5, padj = 0.25, lwd=-1, las=1)
log10_axis(side=1, at=c(.01, .1, 1, 10), lwd=0, lwd.ticks=1)
log10_axis(side=2, at=c(.01, .1, 1, 10), lwd=0, lwd.ticks=1)
mtext(side = 1, text = "VAF in cfDNA (%)", line = 4, cex = 1.85)
mtext(side = 2, text = "VAF in WBC (%)", line = 4, cex = 1.85)
legend(x=0.009, y=19, pch=21, col="black", pt.bg=cols, pt.cex=1.55, pt.lwd=.5, legend=c("Biopsy matched", "Biopsy subthreshold", "VUSo", "WBC matched"), box.lwd=-1, cex=1.15)
dev.off()

export_x = cfdna_vs_gdna_vaf_plot %>%
		   dplyr::select(`patient_id` = `patient_id`,
		   				 `tissue` = `subj_type`,
		   				 `gene_id` = `gene_id`,
		   				 `chromosome` = `chrom`,
		   				 `position` = `position`,
		   				 `reference_allele` = `ref_orig`,
		   				 `alternate_allele` = `alt_orig`,
		   				 `cfdna_af` = `afcfdna_nobaq`,
		   				 `wbc_af` = `afgdna_nobaq`,
		   				 `bio_source` = `bio_source`) %>%
		   mutate(bio_source = case_when(
		   		  bio_source == "WBC_matched" ~ "wbc_matched",
		   		  bio_source == "VUSo" ~ "vuso",
		   		  bio_source == "biopsy_matched" ~ "biopsy_matched",
		   		  bio_source == "IMPACT-BAM_matched" ~ "biopsy_subthreshold"))
write_tsv(export_x, path="../res/etc/Source_Data_Extended_Data_Fig_6/Extended_Data_Fig_6c.tsv", append=FALSE, col_names=TRUE)

pdf(file="../res/figureS8/VAF_VAF_nopsedo_no_BAQ.pdf", width=7, height=7)
par(mar = c(6.1, 6, 4.1, 1))
plot(1,1, type="n", xlim=c(0.01,15), ylim=c(0.01,15), xlab="", ylab="", axes=FALSE, frame.plot=FALSE, log="xy")
points(x=c(0.01, 15), y=c(0.01,15), type="l", lty=1, lwd=2, col="goldenrod3")
x = cfdna_vs_gdna_vaf_plot$afcfdna_nobaq_nos
y = cfdna_vs_gdna_vaf_plot$afgdna_nobaq_nos
x[x>15] = NA
y[y>15] = NA
for (i in c("VUSo", "WBC matched", "Biopsy matched", "Biopsy subthreshold")) {
	index = cfdna_vs_gdna_vaf_plot$bio_source==i
	points(x[index], y[index], type="p", col="black", bg=cols[i], pch=21, lwd=.9, cex=1.15)
}
axis(1, at = c(.01, .10, 1.0, 10.0, 15.0), labels = c(expression(10^-2), expression(10^-1), "1", "10", ""), cex.axis = 1.5, padj = 0.25, lwd=1.75, lwd.ticks=1.65)
axis(1, at = 15, labels = "15", cex.axis = 1.5, padj = 0.25, lwd=-1, las=1)
axis(2, at = c(.01, .10, 1.0, 10.0, 15.0), labels = c(expression(10^-2), expression(10^-1), "1", "10", ""), cex.axis = 1.5, las = 1, lwd=1.75, lwd.ticks=1.65)
axis(2, at = 15, labels = "15", cex.axis = 1.5, padj = 0.25, lwd=-1, las=1)
log10_axis(side=1, at=c(.01, .1, 1, 10), lwd=0, lwd.ticks=1)
log10_axis(side=2, at=c(.01, .1, 1, 10), lwd=0, lwd.ticks=1)
mtext(side = 1, text = "VAF in cfDNA (%)", line = 4, cex = 1.85)
mtext(side = 2, text = "VAF in WBC (%)", line = 4, cex = 1.85)
legend(x=0.009, y=19, pch=21, col="black", pt.bg=cols, pt.cex=1.55, pt.lwd=.5, legend=c("Biopsy matched", "Biopsy subthreshold", "VUSo", "WBC matched"), box.lwd=-1, cex=1.15)
dev.off()

export_x = cfdna_vs_gdna_vaf_plot %>%
		   dplyr::select(`patient_id` = `patient_id`,
		   				 `tissue` = `subj_type`,
		   				 `gene_id` = `gene_id`,
		   				 `chromosome` = `chrom`,
		   				 `position` = `position`,
		   				 `reference_allele` = `ref_orig`,
		   				 `alternate_allele` = `alt_orig`,
		   				 `cfdna_af` = `afcfdna_nobaq_nos`,
		   				 `wbc_af` = `afgdna_nobaq_nos`,
		   				 `bio_source` = `bio_source`) %>%
		   mutate(bio_source = case_when(
		   		  bio_source == "WBC_matched" ~ "wbc_matched",
		   		  bio_source == "VUSo" ~ "vuso",
		   		  bio_source == "biopsy_matched" ~ "biopsy_matched",
		   		  bio_source == "IMPACT-BAM_matched" ~ "biopsy_subthreshold"))
write_tsv(export_x, path="../res/etc/Source_Data_Extended_Data_Fig_6/Extended_Data_Fig_6d.tsv", append=FALSE, col_names=TRUE)
ndbrown6/MSK-GRAIL-TECHVAL documentation built on March 29, 2020, 4:41 p.m.