R/0_exec_02_concordance_analysis.R

#==================================================
# David Brown
# brownd7@mskcc.org
#==================================================
rm(list=ls(all=TRUE))
source('config.R')

if (!dir.exists("../res/figure2")) {
	dir.create("../res/figure2")
}

if (!dir.exists("../res/etc/Source_Data_Fig_2")) {
	dir.create("../res/etc/Source_Data_Fig_2")
}

#==================================================
# barplot of recurrent genes including
# hypermutators
#==================================================
clinical = read_tsv(file=clinical_file, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
		   
snv_vars = read_tsv(snv_file$scored, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(level_2a = as.character(level_2a)) %>%
		   mutate(level_r1 = as.character(level_r1))
		   
indel_vars = read_tsv(indel_file$scored, col_types = cols(.default = col_character())) %>%
			 type_convert() %>%
			 mutate(level_2a = as.character(level_2a)) %>%
		     mutate(level_r1 = as.character(level_r1))

wbc_stack = read_tsv(wbc_variants$scored, col_types = cols(.default = col_character())) %>%
			type_convert()
			
msk_anno = read_tsv(msk_anno_joined, col_types = cols(.default = col_character())) %>%
  		   type_convert()
			
tracker_grail = read_csv(file=patient_tracker)

tracker_impact = read_csv(file=impact_tracker)

valid_patient_ids = tracker_grail %>%
					filter(patient_id %in% tracker_impact$patient_id) %>%
					.[["patient_id"]]
  
indel_vars = indel_vars %>%
			 mutate(filter = replace(filter,
             		patient_id == "MSK-VB-0001" &
             		gene == "GATA3" &
             		filter == "PASS",
             		"CSR_MATCH_ELIMINATED"),
         	 		ccd = replace(ccd,
             			   		  patient_id == "MSK-VB-0001" &
                           		  gene == "GATA3" &
                           		  filter == "CSR_MATCH_ELIMINATED",
                           		  0))

snv_plasma = snv_vars %>%
  			 filter(ccd == 1,
         			(c_panel == 1 | panel == 1),
         			study == "TechVal",
         			grail == 1 | MSK == 1,
         			patient_id %in% valid_patient_ids) %>%
			 mutate(vtype = "SNV")

indel_plasma = indel_vars %>%
			   filter(ccd == 1,
         			  (c_panel == 1 | panel == 1),
         			  study == "TechVal",
         			  grail == 1 | MSK == 1,
         			  patient_id %in% valid_patient_ids) %>%
  			   mutate(vtype = "INDEL",
         			  altenddistmedian = as.integer(altenddistmedian))
         			  
healthy_snv = snv_vars %>%
  			  filter((c_panel == 1 | panel == 1),
         			  subj_type == "Healthy",
         			  grail == 1) %>%
  			  mutate(vtype = "SNV")

healthy_indel = indel_vars %>%
  				filter((c_panel == 1 | panel == 1),
         			    subj_type == "Healthy",
         				grail == 1) %>%
  				mutate(vtype = "INDEL",
         			   altenddistmedian = as.integer(altenddistmedian))

small_vars_plasma = full_join(snv_plasma, indel_plasma) %>%
					full_join(healthy_snv) %>%
					full_join(healthy_indel)
small_vars_plasma = small_vars_plasma %>%
  					mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))

small_vars_plasma = small_vars_plasma %>%
					mutate(loc = str_c(chrom, ":", position_orig, "_", ref_orig, ">", alt_orig))

all_patient_table = small_vars_plasma %>%
					distinct(subj_type, patient_id)
					
all_patient_table = cbind.data.frame(subj_type = rep(all_patient_table$subj_type, 4),
                                     patient_id = rep(all_patient_table$patient_id, 4),
                                     bio_source = rep(c("WBC_matched",
                                                        "VUSo",
                                                        "biopsy_matched",
                                                        "biopsy_only"),
                                                       each = nrow(all_patient_table)))

variants = label_bio_source(small_vars_plasma)

variants = left_join(variants, msk_anno %>% dplyr::select(patient_id, chrom, position, ref, alt, CASE:complex_indel_duplicate))
variants = variants %>%
		   mutate(bio_source = case_when(
		   					   MSK == 1 & grail == 1 ~ "biopsy_matched",
		   					   MSK == 1 & grail == 0 ~ "biopsy_only",
		   					   category %in% c("artifact", "edge", "low_depth", "low_qual") ~ "noise",
		   					   category %in% c("germline", "germlineish") ~ "germline",
		   					   category %in% c("blood", "bloodier") ~ "WBC_matched",
		   					   category == "somatic" & `IM-T.alt_count` > bam_read_count_cutoff ~ "IMPACT-BAM_matched",
		   					   category == "somatic" ~ "VUSo",
		   					   TRUE ~ "other"),
		   		  af = ifelse(is.na(af), 0, af),
		   		  af_nobaq = round(adnobaq / dpnobaq * 100, 2),
		   		  af_nobaq = ifelse(is.na(af_nobaq), 0, af_nobaq))
		   		  
# sum the number of times a gene occurs in a bio_source in each tissue type
subj_num_smry = variants %>%
				group_by(subj_type) %>%
    			dplyr::summarize(subj_num = length(unique(patient_id))) %>%
    			ungroup()
  
gene_recurrences = variants %>%
				   filter(!is.na(gene), bio_source %in% c("biopsy_matched", "biopsy_only") | (bio_source %in% c("IMPACT-BAM_matched", "VUSo") & is_nonsyn)) %>%
 				   group_by(bio_source, subj_type, gene) %>%
 				   dplyr::summarize(number = n(), num_patient = length(unique(patient_id))) %>%
 				   ungroup %>%
 				   left_join(subj_num_smry) %>%
 				   mutate(percent_patient = round(num_patient / subj_num * 100, 0))
				   
gene_list = c()
for (subj in subj_num_smry$subj_type[subj_num_smry$subj_type != "Control"]) {
	top_cancer_gene_ids = gene_recurrences %>%
						  filter(subj_type == subj,
						  bio_source %in% c("biopsy_matched", "biopsy_only")) %>%
 						  group_by(gene) %>%
 						  dplyr::summarize(all = sum(percent_patient)) %>%
 						  ungroup() %>%
 						  arrange(-all) %>%
 						  slice(1:15) %>%
 						  .[["gene"]]
 	gene_list = c(gene_list, top_cancer_gene_ids)
}
gene_list = unique(gene_list)

top_cancer_genes_ordered = gene_recurrences %>%
 						   filter(bio_source %in% c("biopsy_matched", "biopsy_only"), gene %in% gene_list) %>%
 						   group_by(gene) %>%
 						   dplyr::summarize(perc = sum(percent_patient)) %>%
 						   ungroup() %>%
 						   arrange(-perc) %>%
 						   .[["gene"]]
   
top_cancer_genes_table = gene_recurrences %>%
 						 filter(bio_source %in% c("biopsy_matched", "IMPACT-BAM_matched", "VUSo"), gene %in% gene_list, subj_type != "Control") %>%
 						 dplyr::select(subj_type, gene, percent_patient, bio_source)
   
top_cancer_genes_table = top_cancer_genes_table %>%
 						 mutate(gene = factor(gene, levels = top_cancer_genes_ordered),
 						 subj_type = factor(subj_type, levels = c("Breast", "Lung", "Prostate")),
 						 bio_source = factor(bio_source, levels = c("VUSo", "biopsy_matched", "IMPACT-BAM_matched")))

pdf(file="../res/figure2/recurrent_genes_combined.pdf", width=10, height=6.7)
par(mar = c(6.1, 6, 4.1, 1))
zz = split.screen(figs=matrix(c(0,1,2/3-.16,1, 0,1,1/3-.08,2/3+.08, 0,1,0,1/3+.16, 0,1,0,1), nrow=4, ncol=4, byrow=TRUE))
cancer_types = c("Breast", "Lung", "Prostate")
for (i in 1:length(cancer_types)) {
	screen(zz[i])
	x.1 = top_cancer_genes_table %>%
		  filter(subj_type==cancer_types[i], bio_source=="biopsy_matched")
	x.2 = top_cancer_genes_table %>%
		  filter(subj_type==cancer_types[i], bio_source=="IMPACT-BAM_matched")
	x.3 = top_cancer_genes_table %>%
		  filter(subj_type==cancer_types[i], bio_source=="VUSo")
	x = matrix(0, nrow=length(top_cancer_genes_ordered), ncol=3)
 	rownames(x) = top_cancer_genes_ordered
 	colnames(x) = c("biopsy_matched","IMPACT-BAM_matched","VUSo")
 	if (nrow(x.1)!=0) {
 		x[as.character(x.1$gene),"biopsy_matched"] = as.numeric(x.1$percent_patient)
 	}
 	if (nrow(x.2)!=0) {
 		x[as.character(x.2$gene),"IMPACT-BAM_matched"] = as.numeric(x.2$percent_patient)
 	}
 	if (nrow(x.3)!=0) {
 		x[as.character(x.3$gene),"VUSo"] = as.numeric(x.3$percent_patient)
 	}
 	zzz = barplot(t(x), beside=FALSE, names.arg=rep("",nrow(x)), col=variant_cols[colnames(x)], axes=FALSE, ylim=c(0,50))
 	axis(2, at = seq(0,50,l=6), labels = seq(0,50,l=6), cex.axis = 1.05, las = 1, line = 0, lwd=1)
 	if (i==3) {
 		axis(side=1, at=zzz, labels=top_cancer_genes_ordered, las=2, font=3, cex=.75, lwd=-1)
 	}
 	title(main=paste0("\n\n",cancer_types[i]), cex.main=1.25)
}
screen(zz[4])
plot(0, 0, type="n", xlim=c(0,10), ylim=c(0,10), xlab="", ylab="", axes=FALSE, frame.plot=FALSE)
mtext(side = 2, text = "% of patients", line = 3, cex = 1.15)
close.screen(all.screens=TRUE)
dev.off()

export_x = top_cancer_genes_table %>%
		   dplyr::select(`tissue` = `subj_type`,
		   				 `gene_id` = `gene`,
		   				 `percent_patient` = `percent_patient`,
		   				 `bio_source` = `bio_source`)
write_tsv(export_x, path="../res/etc/Source_Data_Fig_2/Fig_2b.tsv", append=FALSE, col_names=TRUE)

#==================================================
# barplot of mutation burden and sources of mutation
# per patient and cohort including hypermutators
#==================================================
clinical = read_tsv(file=clinical_file, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
		   
snv_vars = read_tsv(snv_file$scored, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(level_2a = as.character(level_2a)) %>%
		   mutate(level_r1 = as.character(level_r1))
		   
indel_vars = read_tsv(indel_file$scored, col_types = cols(.default = col_character())) %>%
			 type_convert() %>%
			 mutate(level_2a = as.character(level_2a)) %>%
		     mutate(level_r1 = as.character(level_r1))

wbc_stack = read_tsv(wbc_variants$scored, col_types = cols(.default = col_character())) %>%
			type_convert()
			
msk_anno = read_tsv(msk_anno_joined, col_types = cols(.default = col_character())) %>%
  		   type_convert()
			
tracker_grail = read_csv(file=patient_tracker)

tracker_impact = read_csv(file=impact_tracker)

valid_patient_ids = tracker_grail %>%
					filter(patient_id %in% tracker_impact$patient_id) %>%
					.[["patient_id"]]
  
indel_vars = indel_vars %>%
			 mutate(filter = replace(filter,
             		patient_id == "MSK-VB-0001" &
             		gene == "GATA3" &
             		filter == "PASS",
             		"CSR_MATCH_ELIMINATED"),
         	 		ccd = replace(ccd,
             			   		  patient_id == "MSK-VB-0001" &
                           		  gene == "GATA3" &
                           		  filter == "CSR_MATCH_ELIMINATED",
                           		  0))

snv_plasma = snv_vars %>%
  			 filter(ccd == 1,
         			(c_panel == 1 | panel == 1),
         			study == "TechVal",
         			grail == 1 | MSK == 1,
         			patient_id %in% valid_patient_ids) %>%
			 mutate(vtype = "SNV")

indel_plasma = indel_vars %>%
			   filter(ccd == 1,
         			  (c_panel == 1 | panel == 1),
         			  study == "TechVal",
         			  grail == 1 | MSK == 1,
         			  patient_id %in% valid_patient_ids) %>%
  			   mutate(vtype = "INDEL",
         			  altenddistmedian = as.integer(altenddistmedian))
         			  
healthy_snv = snv_vars %>%
  			  filter((c_panel == 1 | panel == 1),
         			  subj_type == "Healthy",
         			  grail == 1) %>%
  			  mutate(vtype = "SNV")

healthy_indel = indel_vars %>%
  				filter((c_panel == 1 | panel == 1),
         			    subj_type == "Healthy",
         				grail == 1) %>%
  				mutate(vtype = "INDEL",
         			   altenddistmedian = as.integer(altenddistmedian))

small_vars_plasma = full_join(snv_plasma, indel_plasma) %>%
					full_join(healthy_snv) %>%
					full_join(healthy_indel)
small_vars_plasma = small_vars_plasma %>%
  					mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))

small_vars_plasma = small_vars_plasma %>%
					mutate(loc = str_c(chrom, ":", position_orig, "_", ref_orig, ">", alt_orig))

all_patient_table = small_vars_plasma %>%
					distinct(subj_type, patient_id)
					
all_patient_table = cbind.data.frame(subj_type = rep(all_patient_table$subj_type, 4),
                                     patient_id = rep(all_patient_table$patient_id, 4),
                                     bio_source = rep(c("WBC_matched",
                                                        "VUSo",
                                                        "biopsy_matched",
                                                        "biopsy_only"),
                                                       each = nrow(all_patient_table)))

variants = label_bio_source(small_vars_plasma)

variants = left_join(variants, msk_anno %>% dplyr::select(patient_id, chrom, position, ref, alt, CASE:complex_indel_duplicate))
variants = variants %>%
		   mutate(bio_source = case_when(
		   					   MSK == 1 & grail == 1 ~ "biopsy_matched",
		   					   MSK == 1 & grail == 0 ~ "biopsy_only",
		   					   category %in% c("artifact", "edge", "low_depth", "low_qual") ~ "noise",
		   					   category %in% c("germline", "germlineish") ~ "germline",
		   					   category %in% c("blood", "bloodier") ~ "WBC_matched",
		   					   category == "somatic" & `IM-T.alt_count` > bam_read_count_cutoff ~ "IMPACT-BAM_matched",
		   					   category == "somatic" ~ "VUSo",
		   					   TRUE ~ "other"),
		   		  af = ifelse(is.na(af), 0, af),
		   		  af_nobaq = round(adnobaq / dpnobaq * 100, 2),
		   		  af_nobaq = ifelse(is.na(af_nobaq), 0, af_nobaq))

ordered_pat_list = list()
 
pdf(file="../res/figure2/allele_frequency_combined.pdf", width=28, height=10)
par(mar = c(6.1, 6, 4.1, 1))
zz = split.screen(figs=matrix(c(0+.05,1/3+.05,0.39,1, 1/3,2/3,0.39,1, 2/3-.05,1-.05,0.39,1, 0+.05,1/3+.05,0,.6, 1/3,2/3,0,.6, 2/3-.05,1-.05,0,.6), nrow=6, ncol=4, byrow=TRUE))
cancer_types = c("Breast", "Lung", "Prostate")
for (i in 1:length(cancer_types)) {
	subj_small_vars = variants %>%
 					  filter(subj_type == cancer_types[i],
 					  bio_source %in% c("biopsy_matched", "biopsy_only") | (bio_source %in% c("IMPACT-BAM_matched", "VUSo") & is_nonsyn))
 
	ordered_patient_ids = subj_small_vars %>%
 						  group_by(patient_id) %>%
 						  mutate(max_af = max(af_nobaq)) %>%
 						  dplyr::select(patient_id, max_af) %>%
 						  unique() %>%
 						  arrange(max_af) %>%
 						  dplyr::select(patient_id) %>%
 						  unlist()
   
 	if (cancer_types[i] == "Control") {
     	sample_ids = tracker_grail %>%
     				 filter(study == "Merlin", tissue == "Healthy") %>% .[["patient_id"]]
   	} else {
     	sample_ids = tracker_grail %>%
     				 filter(patient_id %in% unique(tracker_impact$patient_id)) %>%
     				 filter(study == "TechVal", tissue == cancer_types[i]) %>% .[["patient_id"]]
   	}
   	zero_ids = sample_ids[!sample_ids %in% ordered_patient_ids]
   
  	if (length(zero_ids) > 0) {
     	sample_id_table = data_frame(patient_id = zero_ids,
                                   	 bio_source = "VUSo",
                                   	 af_nobaq = NA)
     	subj_small_vars = bind_rows(sample_id_table, subj_small_vars %>%
                                    dplyr::select(patient_id, bio_source, af_nobaq))
     	ordered_patient_ids = c(zero_ids, ordered_patient_ids)
 	}
   	subj_small_vars = subj_small_vars %>%
     				  mutate(patient_id = factor(patient_id, levels = ordered_patient_ids),
         			  		 bio_source = factor(bio_source, levels = c("VUSo", "IMPACT-BAM_matched", "biopsy_matched", "biopsy_only")))

	ordered_pat_list[[cancer_types[i]]] = ordered_patient_ids
 	
 	cancer_vars_tissue_ref = variants %>%
  						     filter(subj_type != "Control",
  						     bio_source %in% c("biopsy_matched", "biopsy_only", "VUSo"))
 	per_pat_smry = cancer_vars_tissue_ref %>%
   				   filter(bio_source != "VUSo") %>%
   				   group_by(subj_type, patient_id, bio_source) %>%
   				   dplyr::summarize(num_variants = n()) %>%
   				   ungroup()
  
 	subj_smry = per_pat_smry %>%
   			    filter(subj_type == cancer_types[i])
   			  
 	# add zero-mutation samples
  	pat_ids = unique(subj_smry$patient_id)
  	zero_ids = ordered_pat_list[[cancer_types[i]]][!ordered_pat_list[[cancer_types[i]]] %in% pat_ids]
   	if (length(zero_ids) != 0) {
   		zero_id_table = data_frame(subj_type = cancer_types[i],
     							   patient_id = zero_ids,
     							   bio_source = "biopsy_matched",
     							   num_variants = NA)
     	subj_smry = bind_rows(zero_id_table, subj_smry)
   	}
  
   	plt_smry = data.frame(matrix(0, nrow=length(ordered_patient_ids), ncol=2))
   	rownames(plt_smry) = ordered_patient_ids
   	colnames(plt_smry) = c("biopsy_matched","biopsy_only")
   	tmp = subj_smry$num_variants[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_matched"]
   	names(tmp) = subj_smry$patient_id[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_matched"]
   	plt_smry[names(tmp),"biopsy_matched"] = tmp
 	tmp = subj_smry$num_variants[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_only"]
   	names(tmp) = subj_smry$patient_id[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_only"]
   	plt_smry[names(tmp),"biopsy_only"] = tmp
   	plt_smry[is.na(plt_smry)] = 0
   	screen(zz[i+3])
   	plot(1, 1, type="n", xlim=c(1,length(ordered_patient_ids)+5), ylim=c(35,0), xlab="", ylab="", axes=FALSE, frame.plot=FALSE)
   	zzz = barplot(plt_smry[,"biopsy_matched"]+plt_smry[,"biopsy_only"], space=.08, add=TRUE, axes=FALSE, ylab="", col=transparent_rgb(variant_cols["biopsy_only"], 155))
   	barplot(plt_smry[,"biopsy_matched"], space=.08, add=TRUE, axes=FALSE, ylab="", col=variant_cols["biopsy_matched"])
   	if (i==1) {
 		axis(2, at = seq(0,35,by=5), labels=c("", seq(5,35,by=5)), cex.axis = 1.95, las = 1, line = 0, lwd=2)
 		mtext(side = 2, text = "Number of variants", line = 6, cex = 1.95)
   	} else {
   		axis(2, at = seq(0,35,by=5), labels=rep("",8), cex.axis = 1.25, las = 1, line=0, lwd=2)
   	}
   	
   	screen(zz[i])
  	dot_cols = c("VUSo" = "#231F20", "biopsy_matched" = "#231F20", "IMPACT-BAM_matched" = "#231F20", "biopsy_only" = NA)
  	dot_shapes = c("VUSo" = 21, "biopsy_matched" = 21, "IMPACT-BAM_matched" = 25, "biopsy_only" = NA)
  	dot_bg = c(variant_cols[c("VUSo", "biopsy_matched", "IMPACT-BAM_matched")], "biopsy_only" = NA)
  	plot(1, 1, type="n", xlim=c(1,length(ordered_patient_ids)+5), ylim=c(.02, 100), log="y", xlab="", ylab="", axes=FALSE, frame.plot=FALSE)
  	for (j in 1:length(ordered_patient_ids)) {
  		points(rep(zzz[j,1],sum(subj_small_vars[,"patient_id"]==ordered_patient_ids[j])), subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"af_nobaq",drop=TRUE],
    		   pch=dot_shapes[as.character(subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"bio_source",drop=TRUE])],
    		   col=dot_cols[as.character(subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"bio_source",drop=TRUE])],
    		   bg=dot_bg[as.character(subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"bio_source",drop=TRUE])], lwd=.5)
    	}
    if (i==1) {
  		axis(2, at = c(.02,.1,.5,1,5,10,50,100), labels = c("0.02","0.1","0.5","1","5","10","50", "100"), cex.axis = 1.95, las = 1, line = 0, lwd=2)
  		mtext(side = 2, text = "VAF (%)", line = 6, cex = 1.95)
  	} else {
   	  	axis(2, at = c(.02,.1,.5,1,5,10,50,100), labels = rep("",8), cex.axis = 1.25, las = 1, line=0, lwd=2)
    }
    title(main=paste0("\n", cancer_types[i]), cex.main=1.85)
    
    export_x = plt_smry %>%
    		   mutate(`patient_id` = rownames(plt_smry)) %>%
    		   dplyr::select(`patient_id`,
    		   				 `biopsy_matched`,
    		   				 `biopsy_only`)
    export_y = subj_small_vars %>%
    		   dplyr::select(`patient_id` = `patient_id`,
    		   				 `bio_source` = `bio_source`,
    		   				 `af_cfdna` = `af_nobaq`)
    write_tsv(export_x, path=paste0("../res/etc/Source_Data_Fig_2/Fig_2d_1_", i, ".tsv"), append=FALSE, col_names=TRUE)
    write_tsv(export_y, path=paste0("../res/etc/Source_Data_Fig_2/Fig_2d_2_", i, ".tsv"), append=FALSE, col_names=TRUE)
}
close.screen(all.screens=TRUE)
dev.off()
 
pdf(file="../res/figure2/allele_frequency_combined_bis.pdf", width=28, height=10)
par(mar = c(6.1, 6, 4.1, 1))
zz = split.screen(figs=matrix(c(0+.05,1/3+.05,0.39,1, 1/3,2/3,0.39,1, 2/3-.05,1-.05,0.39,1, 0+.05,1/3+.05,0,.6, 1/3,2/3,0,.6, 2/3-.05,1-.05,0,.6, 0,1,0,1), nrow=7, ncol=4, byrow=TRUE))
cancer_types = c("Control")
for (i in 1:length(cancer_types)) {
	subj_small_vars = variants %>%
 					  filter(subj_type == cancer_types[i],
 					  bio_source %in% c("biopsy_matched", "VUSo", "biopsy_only"))
 
 	ordered_patient_ids = subj_small_vars %>%
 						  group_by(patient_id) %>%
 						  mutate(max_af = max(af_nobaq)) %>%
 						  dplyr::select(patient_id, max_af) %>%
 						  unique() %>%
 						  arrange(max_af) %>%
 						  dplyr::select(patient_id) %>%
 						  unlist()
   
 	if (cancer_types[i] == "Control") {
     	sample_ids = tracker_grail %>%
     				 filter(study == "Merlin", tissue == "Healthy") %>% .[["patient_id"]]
   	} else {
     	sample_ids = tracker_grail %>%
     				 filter(patient_id %in% unique(tracker_impact$patient_id)) %>%
     				 filter(study == "TechVal", tissue == cancer_types[i]) %>% .[["patient_id"]]
   	}
   	zero_ids = sample_ids[!sample_ids %in% ordered_patient_ids]
   
 	if (length(zero_ids) > 0) {
     	sample_id_table = data.frame(patient_id = zero_ids,
                                   	 bio_source = "VUSo",
                                   	 af_nobaq = NA)
     	subj_small_vars = bind_rows(sample_id_table, subj_small_vars %>%
                                    dplyr::select(patient_id, bio_source, af_nobaq))
    	ordered_patient_ids = c(zero_ids, ordered_patient_ids)
 	}
   	subj_small_vars = subj_small_vars %>%
     				  mutate(patient_id = factor(patient_id, levels = ordered_patient_ids),
         			  bio_source = factor(bio_source, levels = c("VUSo", "biopsy_matched", "biopsy_only")))
   
 	ordered_pat_list[[cancer_types[i]]] = ordered_patient_ids
 	
 	cancer_vars_tissue_ref = variants %>%
   						     filter(subj_type != "Control",
   						     bio_source %in% c("biopsy_matched", "biopsy_only", "VUSo"))
 	per_pat_smry = cancer_vars_tissue_ref %>%
   				   filter(bio_source != "VUSo") %>%
   				   group_by(subj_type, patient_id, bio_source) %>%
   				   dplyr::summarize(num_variants = n()) %>%
   				   ungroup()
 
 	subj_smry = per_pat_smry %>%
   			    filter(subj_type == cancer_types[i])
   			  
 	# add zero-mutation samples
 	pat_ids = unique(subj_smry$patient_id)
  	zero_ids = ordered_pat_list[[cancer_types[i]]][!ordered_pat_list[[cancer_types[i]]] %in% pat_ids]
   	if (length(zero_ids) != 0) {
   		zero_id_table = data.frame(subj_type = cancer_types[i],
     							   patient_id = zero_ids,
     							   bio_source = "biopsy_matched",
     							   num_variants = NA)
     	subj_smry = bind_rows(zero_id_table, subj_smry)
   	}
   
   	plt_smry = data.frame(matrix(0, nrow=length(ordered_patient_ids), ncol=2))
   	rownames(plt_smry) = ordered_patient_ids
   	colnames(plt_smry) = c("biopsy_matched","biopsy_only")
   	tmp = subj_smry$num_variants[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_matched"]
   	names(tmp) = subj_smry$patient_id[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_matched"]
   	plt_smry[names(tmp),"biopsy_matched"] = tmp
 	tmp = subj_smry$num_variants[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_only"]
   	names(tmp) = subj_smry$patient_id[subj_smry$patient_id %in% ordered_patient_ids & subj_smry$bio_source=="biopsy_only"]
   	plt_smry[names(tmp),"biopsy_only"] = tmp
   	plt_smry[is.na(plt_smry)] = 0
   	screen(zz[i+3])
   	plot(1, 1, type="n", xlim=c(1,length(ordered_patient_ids)+5), ylim=c(35,0), xlab="", ylab="", axes=FALSE, frame.plot=FALSE)
   	zzz = barplot(plt_smry[,"biopsy_matched"]+plt_smry[,"biopsy_only"], space=.08, add=TRUE, axes=FALSE, ylab="", col=transparent_rgb(variant_cols["biopsy_only"]), plot=FALSE)
   	
   	screen(zz[i])
 	dot_cols = c("VUSo" = "#231F20", "biopsy_matched" = "#231F20", "IMPACT-BAM_matched" = "#231F20", "biopsy_only" = NA)
 	dot_shapes = c("VUSo" = 21, "biopsy_matched" = 21, "IMPACT-BAM_matched" = 25, "biopsy_only" = NA)
 	dot_bg = c(variant_cols[c("VUSo", "biopsy_matched", "IMPACT-BAM_matched")], "biopsy_only" = NA)
 	plot(1, 1, type="n", xlim=c(1,length(ordered_patient_ids)+5), ylim=c(.02, 100), log="y", xlab="", ylab="", axes=FALSE, frame.plot=FALSE)
 	for (j in 1:length(ordered_patient_ids)) {
 		points(rep(zzz[j,1],sum(subj_small_vars[,"patient_id"]==ordered_patient_ids[j])), subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"af_nobaq"],
   		   	   pch=dot_shapes[subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"bio_source"]],
   		   	   col=dot_cols[subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"bio_source"]],
   		   	   bg=dot_bg[as.character(subj_small_vars[subj_small_vars[,"patient_id"]==ordered_patient_ids[j],"bio_source",drop=TRUE])], lwd=.5)
   	}
   	if (i==1) {
 		axis(2, at = c(.02,.1,.5,1,5,10,50,100), labels = c("0.02","0.1","0.5","1","5","10","50", "100"), cex.axis = 1.95, las = 1, line = 0, lwd=2)
 		mtext(side = 2, text = "VAF (%)", line = 6, cex = 1.95)
 		axis(1, at = zzz[,1], labels=rep("",length(ordered_patient_ids)))
 	} else {
   	  	axis(2, at = c(.02,.1,.5,1,5,10,50,100), labels = rep("",8), cex.axis = 1.25, las = 1, line=0, lwd=2)
    }
    title(main=paste0("\n", cancer_types[i]), cex.main=1.85)
    
    export_x = subj_small_vars %>%
    		   dplyr::select(`patient_id` = `patient_id`,
    		   				 `bio_source` = `bio_source`,
    		   				 `af_cfdna` = `af_nobaq`)
    write_tsv(export_x, path="../res/etc/Source_Data_Fig_2/Fig_2c.tsv", append=FALSE, col_names=TRUE)
}
close.screen(all.screens=TRUE)
dev.off()

#==================================================
# violin plot of ctdna fraction by cancer type
#==================================================
cfdna_frac = read.csv(file=url_ctdna_frac, header=TRUE, sep=",", stringsAsFactors=FALSE) %>%
			 filter(!is.na(ctdna_frac)) %>%
			 mutate(index = order_samples(ID)) %>%
 			 arrange(desc(ctdna_frac)) %>%
			 arrange(index)
cancer_types = c("Breast", "Lung", "Prostate")

pdf(file="../res/figure2/cfdna_fraction_by_cancer_type.pdf", width=5.5, height=7)
par(mar = c(6.1, 6, 4.1, 1))
plot(0, 0, type="n", xlim=c(0.5,3.5), ylim=c(0.01,1), axes=FALSE, frame.plot=FALSE, xlab="", ylab="")
beeswarm(cfdna_frac[cfdna_frac[,3]==1,2], method="swarm", col=cohort_cols["Breast"], add=TRUE, pch=19, at=1, cex=1.75, corral="random", corralWidth=.9)
beeswarm(cfdna_frac[cfdna_frac[,3]==2,2], method="swarm", col=cohort_cols["Lung"], add=TRUE, at=2, pch=19, cex=1.75, corral="random", corralWidth=.85)
beeswarm(cfdna_frac[cfdna_frac[,3]==3,2], method="swarm", col=cohort_cols["Prostate"], add=TRUE, at=3, pch=19, cex=1.75, corral="random", corralWidth=.8)
axis(1, at = 1:3, labels=cancer_types, cex.axis = 1.5, padj = 0.25)
axis(2, at = NULL, cex.axis = 1.5, las = 1)
text(x=2, y=.995, "p = 0.0046" , cex=1.05)
box()
mtext(side = 2, text = "ctDNA fraction", line = 4, cex = 1.5)
dev.off()

pdf(file="../res/figure2/cfdna_fraction_by_cancer_type_bis.pdf", width=5.5, height=7)
par(mar = c(6.1, 6, 4.1, 1))
plot(0, 0, type="n", xlim=c(0.5,3.5), ylim=c(0.01,1), axes=FALSE, frame.plot=FALSE, xlab="", ylab="")
vioplot(cfdna_frac[cfdna_frac[,3]==1,2], at = 1, add = TRUE, col=cohort_cols["Breast"], border = "black", rectCol = "black", lineCol = "black", pchMed = 19, colMed = "white")
vioplot(cfdna_frac[cfdna_frac[,3]==2,2], at = 2, add = TRUE, col=cohort_cols["Lung"], border = "black", rectCol = "black", lineCol = "black", pchMed = 19, colMed = "white")
vioplot(cfdna_frac[cfdna_frac[,3]==3,2], at = 3, add = TRUE, col=cohort_cols["Prostate"], border = "black", rectCol = "black", lineCol = "black", pchMed = 19, colMed = "white")
axis(1, at = 1:3, labels=cancer_types, cex.axis = 1.5, padj = 0.25)
axis(2, at = NULL, cex.axis = 1.5, las = 1)
text(x=2, y=.995, "p = 0.0046" , cex=1.05)
mtext(side = 2, text = "ctDNA fraction", line = 4, cex = 1.5)
dev.off()

export_x = cfdna_frac %>%
		   dplyr::select(`patient_id` = `ID`,
		   				 `tissue` = `index`,
		   				 `ctdna_fraction` = `ctdna_frac`) %>%
		   mutate(tissue = case_when(
		   			tissue == 1 ~ "Breast",
		   			tissue == 2 ~ "Lung",
		   			tissue == 3 ~ "Prostate"))
write_tsv(export_x, path="../res/etc/Source_Data_Fig_2/Fig_2f.tsv", append=FALSE, col_names=TRUE)
 
#==================================================
# box plot of ctdna fraction by
# number of metastatic sites
#==================================================
clinical = read_tsv(file=clinical_file_updated, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
cfdna_frac = read.csv(file=url_ctdna_frac, header=TRUE, sep=",", stringsAsFactors=FALSE) %>%
			 filter(!is.na(ctdna_frac)) %>%
			 mutate(index = order_samples(ID)) %>%
 			 arrange(desc(ctdna_frac)) %>%
			 arrange(index)
number_metastatic_sites = unlist(lapply(strsplit(clinical[,"metastatic_sites",drop=TRUE], split=",", fixed=TRUE), function(x) {length(x)}))
number_metastatic_sites[is.na(clinical[,"metastatic_sites",drop=TRUE])] = 0
names(number_metastatic_sites) = clinical[,"patient_id",drop=TRUE]
number_metastatic_sites = number_metastatic_sites[cfdna_frac[,1]]

x = list(cfdna_frac[number_metastatic_sites==1 | number_metastatic_sites==2,2],
		 cfdna_frac[number_metastatic_sites==3,2],
		 cfdna_frac[number_metastatic_sites>4,2])
y = list(cfdna_frac[number_metastatic_sites==1 | number_metastatic_sites==2,1],
		 cfdna_frac[number_metastatic_sites==3,1],
		 cfdna_frac[number_metastatic_sites>4,1])
for (i in 1:3) {
	y[[i]][grep("VB", y[[i]])] = "Breast"
	y[[i]][grep("VL", y[[i]])] = "Lung"
	y[[i]][grep("VP", y[[i]])] = "Prostate"
}
z = list()
for (i in 1:3) {
	z[[i]] = rep(i, length(x[[i]]))
}

tmp.0 = data_frame(
			ctdna = as.numeric(unlist(x)),
			tissue = factor(unlist(y), levels=c("Breast", "Lung", "Prostate"), ordered=TRUE),
			cat3 = factor(unlist(z), levels=c("1", "2", "3"), ordered=TRUE))
				 
plot.0 = ggplot(tmp.0, aes(x = tissue, y = ctdna, color = cat3, group=interaction(tissue, cat3))) + 
	     geom_boxplot(alpha=1, outlier.size=NA, outlier.shape=NA, fill="white", width=.8) +
	     scale_colour_manual(values=c("1"="black", "2"="black", "3"="black")) +
	     geom_jitter(
		 	aes(x = tissue, y = ctdna, group = cat3, fill=tissue),
  		    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8),
  		    alpha=1, size=3, shape=21,
  		 ) +
  		 scale_fill_manual(values=cohort_cols) +
	     theme_classic(base_size=15) +
 	     theme(axis.text.y = element_text(size=13), axis.text.x = element_text(size=10), legend.position="none") +
 	     labs(x="", y="ctDNA fraction\n") +
 	     coord_cartesian(ylim = c(0,1))
 	     
pdf(file="../res/figure2/cfdna_fraction_by_number_of_site.pdf", width=8.5, height=5)
print(plot.0)
dev.off()

#==================================================
# box plot of ctdna fraction by
# disease volume
#==================================================
ctdna_fraction = read_csv(file=url_ctdna_frac, col_types = cols(.default = col_character()))  %>%
 				 type_convert() %>%
 				 rename(GRAIL_ID = ID)

vol_breast = read.xls(xls=url_volumetric_data, sheet=1, stringsAsFactors=FALSE) %>%
			 type_convert() %>%
			 mutate(Date_Tissue = as.character(Date_Tissue)) %>%
			 mutate(Date_Most_Recent_Scan = as.character(Date_Most_Recent_Scan)) %>%
			 mutate(Date_of_Last_Treatment = as.character(Date_of_Last_Treatment))

vol_lung = read.xls(xls=url_volumetric_data, sheet=2, stringsAsFactors=FALSE) %>%
		   type_convert() %>%
		   mutate(Date_Tissue = as.character(Date_Tissue)) %>%
		   mutate(Date_Most_Recent_Scan = as.character(Date_Most_Recent_Scan)) %>%
		   mutate(Date_of_Last_Treatment = as.character(Date_of_Last_Treatment)) %>%
		   mutate(Date_Dx = as.character(Date_Dx)) %>%
		   mutate(Date_Metastatic = as.character(Date_Metastatic)) %>%
		   mutate(Date_Tissue = as.character(Date_Tissue))
		   
vol_prostate = read.xls(xls=url_volumetric_data, sheet=3, stringsAsFactors=FALSE) %>%
		       type_convert() %>%
		       mutate(BSI_Value = ifelse(BSI_Value==">13", 14, BSI_Value)) %>%
		       mutate(BSI_Value = ifelse(BSI_Value=="n/a", NA, BSI_Value)) %>%
		       mutate(BSI_Value = as.numeric(BSI_Value)) %>%
		       rename(BSI_Volume = BSI_Value) %>%
		       mutate(Date_Dx = as.character(Date_Dx)) %>%
		       mutate(Date_Metastatic = as.character(Date_Metastatic)) %>%
		       mutate(Date_Tissue = as.character(Date_Tissue))

volumetric_data = bind_rows(vol_breast, vol_lung) %>%
				  bind_rows(vol_prostate) %>%
				  left_join(ctdna_fraction, by="GRAIL_ID")
				  
volumetric_data = volumetric_data %>%
				  mutate(tot_volu = apply( volumetric_data[,which(grepl('volume', colnames(volumetric_data), ignore.case = T))] , 1 , function(x) sum(as.numeric(x), na.rm=T) )) %>%
				  mutate(tot_mets = apply( volumetric_data[,which(!grepl('volume', colnames(volumetric_data), ignore.case = T) & grepl('Lesions|Lymph_Nodes|Pleural_Disease', colnames(volumetric_data), ignore.case = T))] , 1 , function(x) sum(as.numeric(x), na.rm=T))) %>%
				  filter(!(GRAIL_ID %in% c('MSK-VL-0057','MSK-VL-0003','MSK-VB-0046')))
				  
pattern = c("VB", "VL", "VP")
tissue = c("Breast", "Lung", "Prostate")

tmp.1 = p.1 = NULL
for (i in 1:length(pattern)) {
	tmp = volumetric_data %>%
		  mutate(tot_volu = ifelse(tot_volu==0, 0.1, tot_volu)) %>%
	 	  mutate(ln_frac = log(ctdna_frac)) %>%
	 	  mutate(ln_vol = log(tot_volu)) %>%
		  filter(grepl(pattern[i], GRAIL_ID)) %>%
		  mutate(Tissue = tissue[i]) %>%
		  filter(!(is.infinite(ln_frac) | is.na(ln_frac))) %>%
		  filter(!(is.infinite(ln_vol) | is.na(ln_vol)))
		
	if (tissue[i]=="Breast") {
		tmp = tmp %>%
			  mutate(shape = ifelse(is.na(Bone_Lesions) & is.na(Pleural_Disease), 21, 24))
	} else if (tissue[i]=="Lung") {
		tmp = tmp %>%
			  mutate(shape = ifelse(is.na(Bone_Lesions) & is.na(Pleural_Disease), 21, 24)) %>%
	  		  # Large left forearm bone and soft-tissue mass 10 by 7 by 17 cm not included in PET-scan and measurements
	  		  mutate(shape = ifelse(GRAIL_ID=="MSK-VL-0008", 24, shape))
	} else if (tissue[i]=="Prostate") {
		tmp = tmp %>%
			  mutate(shape = 24)
	}
	

	tmp.0 = tmp %>%
			mutate(cat = case_when(
				tot_volu >= 0 & tot_volu <= quantile(tot_volu, 1/3) ~ 1,
				tot_volu > quantile(tot_volu, 1/3) & tot_volu <= quantile(tot_volu, 2/3) ~ 2,
				tot_volu > quantile(tot_volu, 2/3) ~ 3,
			)) %>%
 			mutate(cat = as.factor(cat))
 	p = signif(jonckheere.test(x=tmp.0$ctdna_frac, g=as.numeric(tmp.0$cat), alternative = "increasing")$p.value, 3)
 	tmp.1 = rbind(tmp.1, tmp.0)
 	p.1 = c(p.1, p)
}

plot.0 = ggplot(tmp.1, aes(x = Tissue, y = ctdna_frac, color = cat, group=interaction(Tissue, cat))) + 
		 geom_boxplot(alpha=1, outlier.size=NA, outlier.shape=NA, fill="white", width=.8) +
		 scale_colour_manual(values=c("1"="black", "2"="black", "3"="black")) +
		 geom_jitter(
		 	aes(x = Tissue, y = ctdna_frac, fill = factor(Tissue), shape = factor(shape)),
  			position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8),
  			alpha=1, size=3
  			) +
  		 scale_shape_manual(values=c("21"=21, "24"=24)) +
  		 scale_fill_manual(values=cohort_cols) +
		 theme_classic(base_size=15) +
 		 theme(axis.text.y = element_text(size=13), axis.text.x = element_text(size=10), legend.position="none") +
 		 labs(x="", y="ctDNA fraction\n") +
 		 coord_cartesian(ylim = c(0,1))

pdf(file="../res/figure2/cfdna_fraction_by_total_dx_vol.pdf", width=8.5, height=5)
print(plot.0)
dev.off()

export_x = tmp.1 %>%
		   dplyr::select(`patient_id` = `GRAIL_ID`,
		   				 `tissue` = `Tissue`,
		   				 `ctdna_fraction` = `ctdna_frac`,
		   				 `tertiles_disease_volume` = `cat`)
write_tsv(export_x, path="../res/etc/Source_Data_Fig_2/Fig_2g.tsv", append=FALSE, col_names=TRUE)

#==================================================
# trend test ctdna fraction by cancer type
#==================================================
x = cfdna_frac[,"ctdna_frac"]
y = rep(1, length(x))
y[grepl("VL", cfdna_frac[,1])] = 2
y[grepl("VP", cfdna_frac[,1])] = 3
 
pander(cor.test(x=x, y=y, method="kendall", alternative="two.sided", exact=FALSE)$p.value)
pander(kruskal.test(x=x, g=y)$p.value)


#==================================================
# trend test ctdna fraction versus number of sites
#==================================================
clinical = read_tsv(file=clinical_file_updated, col_types = cols(.default = col_character())) %>%
		   type_convert() %>%
		   mutate(subj_type = ifelse(subj_type == "Healthy", "Control", subj_type))
cfdna_frac = read.csv(file=url_ctdna_frac, header=TRUE, sep=",", stringsAsFactors=FALSE) %>%
			 filter(!is.na(ctdna_frac)) %>%
			 mutate(index = order_samples(ID)) %>%
 			 arrange(desc(ctdna_frac)) %>%
			 arrange(index)
number_metastatic_sites = unlist(lapply(strsplit(clinical[,"metastatic_sites",drop=TRUE], split=",", fixed=TRUE), function(x) {length(x)}))
number_metastatic_sites[is.na(clinical[,"metastatic_sites",drop=TRUE])] = 0
names(number_metastatic_sites) = clinical[,"patient_id",drop=TRUE]
number_metastatic_sites = number_metastatic_sites[cfdna_frac[,1]]

x = cfdna_frac[,"ctdna_frac"]
y = number_metastatic_sites

pander(cor.test(x=x, y=y, method="kendall", alternative="greater", exact=FALSE)$p.value)
pander(jonckheere.test(x=x, g=y, alternative="increasing", nperm=10000)$p.value)

# jonckheere-terpstra test by cancer type
for (i in c("VB", "VL", "VP")) {
  	index = grepl(i, names(number_metastatic_sites))
  	x = cfdna_frac[index,"ctdna_frac"]
  	y = number_metastatic_sites[index]
  	pander(jonckheere.test(x=x, g=y, alternative="increasing", nperm=10000)$p.value)
}
for (i in c("VB", "VL", "VP")) {
  	index = grepl(i, names(number_metastatic_sites))
  	x = cfdna_frac[index,"ctdna_frac"]
  	y = number_metastatic_sites[index]
  	y0 = rep(1, length(y))
  	y0[y==3] = 2
  	y0[y>3] = 3
 	pander(jonckheere.test(x=x, g=y0, alternative="increasing", nperm=10000)$p.value)
}
 
# kendall correlation test by cancer type
for (i in c("VB", "VL", "VP")) {
 	index = grepl(i, names(number_metastatic_sites))
 	x = cfdna_frac[index,"ctdna_frac"]
 	y = number_metastatic_sites[index]
  	pander(cor.test(x=x, y=y, method="kendall", alternative="greater", exact = FALSE)$p.value)
}
for (i in c("VB", "VL", "VP")) {
 	index = grepl(i, names(number_metastatic_sites))
 	x = cfdna_frac[index,"ctdna_frac"]
 	y = number_metastatic_sites[index]
 	y0 = rep(1, length(y))
 	y0[y==3] = 2
  	y0[y>3] = 3
 	pander(cor.test(x=x, y=y0, method="kendall", alternative="greater", exact = FALSE)$p.value)
}
 
# mblm association test by cancer type
for (i in c("VB", "VL", "VP")) {
 	index = grepl(i, names(number_metastatic_sites))
 	x = cfdna_frac[index,"ctdna_frac"]
 	y = number_metastatic_sites[index]
	data = data.frame(ctDNA_fraction=x, n_sites=y) 
 
 	z = mblm(ctDNA_fraction ~ n_sites, dataframe=data, repeated=FALSE)
 	pander(z)
}
  	
#==================================================
# p-values of detection rate
#==================================================
N = c(39, 41, 44)
n = c(37, 31, 36)
 
# Breast versus lung
m1 = matrix(NA, 2, 2)
m1[1,1] = N[1]-n[1]
m1[2,1] = N[2]-n[2]
m1[1,2] = n[1]
m1[2,2] = n[2]
p1 = fisher.test(m1)$p.value
   
# Breast versus prostate
m2 = matrix(NA, 2, 2)
m2[1,1] = N[1]-n[1]
m2[2,1] = N[3]-n[3]
m2[1,2] = n[1]
m2[2,2] = n[3]
p2 = fisher.test(m2)$p.value
   
# Lung versus prostate
m3 = matrix(NA, 2, 2)
m3[1,1] = N[2]-n[2]
m3[2,1] = N[3]-n[3]
m3[1,2] = n[2]
m3[2,2] = n[3]
p3 = fisher.test(m3)$p.value
   
p = p.adjust(c(p1, p2, p3), method="bonferroni")
pander(p)


#==================================================
# cfDNA detection rate by tumor CCF
#==================================================
tumor_vars = read_tsv(file="../modified_v11/Resources/MSK_additional_data/20180405_table_1f_grail_paper_filters.called_in_both_and_impact.tsv",
			 col_types = cols(.default = col_character())) %>%
		     type_convert()
tumor_vars = tumor_vars %>%
			 mutate("tissue" = case_when(
			 	COHORT == "validation_breast" ~ "Breast",
			 	COHORT == "validation_lung" ~ "Lung",
			 	COHORT == "validation_prostate" ~ "Prostate"
			 )) %>%
			 mutate(is_snv = ifelse(nchar(ALT)==1 & nchar(REF)==1, TRUE, FALSE)) %>%
			 filter(is_snv)

cohort = c("Breast", "Lung", "Prostate")
p = rep(NA, length(cohort))
names(p) = cohort
dfm = list()
for (i in 1:length(cohort)) {
	tmp_vars = tumor_vars %>%
			   filter(tissue == cohort[i] & CALLED_IN %in% c("both","impact"))
	ccf = tmp_vars %>%
		  .[['IM-T.ccf']]
	ccf_cat = cut(ccf, c(0,0.5,0.8,1))
	called_in = tmp_vars %>%
		  	    .[['CALLED_IN']]
	dr_sum = by(called_in, ccf_cat, function(x) { sum(x=="both") })
	tt_sum = table(ccf_cat)
	dr = dr_sum/tt_sum
	ci = binconf(x=dr_sum, n=tt_sum)
	p[i] = prop.trend.test(dr_sum, tt_sum)$p.value

	x = c(sum(tumor_vars$`IM-T.clonality` %in% c("clonal","likely_clonal") & tumor_vars$CALLED_IN=="both"),
		  sum(tumor_vars$`IM-T.clonality` %in% c("clonal","likely_clonal") & tumor_vars$`IM-T.clonality`!=""),
		  sum(tumor_vars$`IM-T.clonality` %in% c("subclonal") & tumor_vars$CALLED_IN=="both"),
		  sum(tumor_vars$`IM-T.clonality` %in% c("subclonal") & tumor_vars$`IM-T.clonality`!=""))

	dfm[[i]] = data.frame(intervals = names(tt_sum),
					 	  dr = as.vector(dr),
					 	  lci = as.vector(ci[,2]),
					 	  uci = as.vector(ci[,3])) %>%
		  				  mutate(tissue = cohort[i])
}
dfm = do.call(rbind, dfm)

export_x = tumor_vars %>%
		   dplyr::select(
		   		`patient_id` = `CASE`,
		   		`tissue` = `tissue`,
		   		`gene_id` = `GENE`,
		   		`chromosome` = `CHROM`,
		   		`position` = `POS`,
		   		`called_in` = `CALLED_IN`,
		   		`t_ccf` = `IM-T.ccf`,
		   		`t_clonality` = `IM-T.clonality`) %>%
		   filter(called_in %in% c("both", "impact"))
write_tsv(export_x, path="../res/etc/Source_Data_Fig_2/Fig_2e.tsv", append=FALSE, col_names=TRUE)
ndbrown6/MSK-GRAIL-TECHVAL documentation built on March 29, 2020, 4:41 p.m.