R/tardbpdms_quality_control.R

Defines functions tardbpdms_quality_control

Documented in tardbpdms_quality_control

#' tardbpdms_quality_control
#'
#' Quality control plots including scatterplots comparing toxicity estimates between replicates.
#'
#' @param toxicity_list named list of folder paths with toxicity estimates (required)
#' @param outpath output path for plots and saved objects (required)
#' @param colour_scheme colour scheme file (required)
#' @param execute whether or not to execute the analysis (default: TRUE)
#'
#' @return Nothing
#' @export
#' @import data.table
tardbpdms_quality_control <- function(
  toxicity_list,
  outpath,
  colour_scheme,
  execute = TRUE
  ){

	#Return previous results if analysis not executed
	if(!execute){
		return()
	}

  #Display status
  message(paste("\n\n*******", "running stage: tardbpdms_quality_control", "*******\n\n"))

	#Create output directory
	tardbpdms__create_dir(tardbpdms_dir = outpath)
	
	### Load data
	###########################

	#DMS variant data (for individual replicates)
	dms_dt1 <- tardbpdms__load_toxicity(toxicity_list[[1]], object=T, read_filter=F)
	dms_dt2 <- tardbpdms__load_toxicity(toxicity_list[[2]], object=T, read_filter=F)

	#Rename column names consistently
	names(dms_dt1)[grep('toxicity[1-4]$', names(dms_dt1))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]$', names(dms_dt1))))
	names(dms_dt2)[grep('toxicity[1-4]$', names(dms_dt2))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]$', names(dms_dt2))))
	names(dms_dt1)[grep('toxicity[1-4]_uncorr$', names(dms_dt1))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]_uncorr$', names(dms_dt1))), "_uncorr")
	names(dms_dt2)[grep('toxicity[1-4]_uncorr$', names(dms_dt2))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]_uncorr$', names(dms_dt2))), "_uncorr")
	names(dms_dt1)[grep('toxicity[1-4]_cond$', names(dms_dt1))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]_cond$', names(dms_dt1))), "_cond")
	names(dms_dt2)[grep('toxicity[1-4]_cond$', names(dms_dt2))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]_cond$', names(dms_dt2))), "_cond")
	names(dms_dt1)[grep('sigma[1-4]$', names(dms_dt1))] <- paste0("sigma", 1:length(grep('sigma[1-4]$', names(dms_dt1))))
	names(dms_dt2)[grep('sigma[1-4]$', names(dms_dt2))] <- paste0("sigma", 1:length(grep('sigma[1-4]$', names(dms_dt2))))
	names(dms_dt1)[grep('sigma[1-4]_uncorr$', names(dms_dt1))] <- paste0("sigma", 1:length(grep('sigma[1-4]_uncorr$', names(dms_dt1))), "_uncorr")
	names(dms_dt2)[grep('sigma[1-4]_uncorr$', names(dms_dt2))] <- paste0("sigma", 1:length(grep('sigma[1-4]_uncorr$', names(dms_dt2))), "_uncorr")
	names(dms_dt1)[grep('sigma[1-4]_cond$', names(dms_dt1))] <- paste0("sigma", 1:length(grep('sigma[1-4]_cond$', names(dms_dt1))), "_cond")
	names(dms_dt2)[grep('sigma[1-4]_cond$', names(dms_dt2))] <- paste0("sigma", 1:length(grep('sigma[1-4]_cond$', names(dms_dt2))), "_cond")

	#Combine regions for supplemental table
	dms_dt1[, region := names(toxicity_list)[1]]
	dms_dt2[, region := names(toxicity_list)[2]]
	dms_dt <- rbind(
		dms_dt1[,.SD,,.SDcols = c("Pos", "WT_AA", "Mut", "Nmut_nt", "Nmut_codons", "sigma", "toxicity", "toxicity_cond",
			"Pos1", "Pos2", "WT_AA1", "WT_AA2", "Mut1", "Mut2", "Nmut_aa", "region", "STOP", "mean_count", "is.reads0",
			names(dms_dt1)[grep('toxicity[1-4]$', names(dms_dt1))], names(dms_dt1)[grep('toxicity[1-4]_uncorr$', names(dms_dt1))], names(dms_dt1)[grep('toxicity[1-4]_cond$', names(dms_dt1))],
			names(dms_dt1)[grep('sigma[1-4]$', names(dms_dt1))], names(dms_dt1)[grep('sigma[1-4]_uncorr$', names(dms_dt1))], names(dms_dt1)[grep('sigma[1-4]_cond$', names(dms_dt1))])], 
		dms_dt2[,.SD,,.SDcols = c("Pos", "WT_AA", "Mut", "Nmut_nt", "Nmut_codons", "sigma", "toxicity", "toxicity_cond", 
			"Pos1", "Pos2", "WT_AA1", "WT_AA2", "Mut1", "Mut2", "Nmut_aa", "region", "STOP", "mean_count", "is.reads0",
			names(dms_dt2)[grep('toxicity[1-4]$', names(dms_dt2))], names(dms_dt2)[grep('toxicity[1-4]_uncorr$', names(dms_dt2))], names(dms_dt2)[grep('toxicity[1-4]_cond$', names(dms_dt2))],
			names(dms_dt2)[grep('sigma[1-4]$', names(dms_dt2))], names(dms_dt2)[grep('sigma[1-4]_uncorr$', names(dms_dt2))], names(dms_dt2)[grep('sigma[1-4]_cond$', names(dms_dt2))])])
	#Remove missing toxicity values
	dms_dt <- dms_dt[!is.na(toxicity) | !is.na(toxicity_cond),]
	#Absolute position (singles)
	dms_dt[, Pos_abs := Pos+as.numeric(region)-1]
	#Absolute position (doubles)
	dms_dt[, Pos_abs1 := Pos1+as.numeric(region)-1]
	dms_dt[, Pos_abs2 := Pos2+as.numeric(region)-1]
	#Mutation code (singles)
	dms_dt[, mut_code := paste0(WT_AA, Pos_abs, Mut)]
	#Mutation code (doubles)
	dms_dt[, mut_code1 := paste0(WT_AA1, Pos_abs1, Mut1)]
	dms_dt[, mut_code2 := paste0(WT_AA2, Pos_abs2, Mut2)]
	#Supplementary data file
	fwrite(dms_dt[is.reads0==T,], file = file.path(outpath, "supplementary_table_raw_toxicity_estimates_all.txt"), sep = "\t")

	#Combine regions for QC
	dms_dt <- rbind(
		dms_dt1[,.SD,,.SDcols = c("Nmut_aa", "region", "toxicity", "toxicity_cond", "toxicity_uncorr", "STOP", "mean_count", "is.reads0",
			names(dms_dt1)[grep('toxicity[1-4]$', names(dms_dt1))], names(dms_dt1)[grep('toxicity[1-4]_cond$', names(dms_dt1))])], 
		dms_dt2[,.SD,,.SDcols = c("Nmut_aa", "region", "toxicity", "toxicity_cond", "toxicity_uncorr", "STOP", "mean_count", "is.reads0",
			names(dms_dt2)[grep('toxicity[1-4]$', names(dms_dt2))], names(dms_dt2)[grep('toxicity[1-4]_cond$', names(dms_dt2))])])

	### Mean counts vs uncorrected toxicity
	###########################

	#Set uncorrected toxicity to toxicity for singles
	dms_dt[Nmut_aa==1, toxicity_uncorr := toxicity]
	#Set conditional toxicity to toxicity for singles
	dms_dt[Nmut_aa==1, toxicity_cond := toxicity]
	plot_df <- reshape2::melt(as.data.frame(dms_dt[,.(mean_count, toxicity_uncorr, toxicity_cond, region)]), id = c("mean_count", "region"))
	d <- ggplot2::ggplot(plot_df, ggplot2::aes(mean_count, value)) +
		ggplot2::scale_x_log10() +
    ggplot2::stat_binhex(bins=30) +
	  ggplot2::xlab("Mean input read count") +
	  ggplot2::ylab("Toxicity") +
	  ggplot2::geom_hline(yintercept = 0, linetype=2) +
	  ggplot2::geom_vline(xintercept = 10, linetype=2, colour = colour_scheme[["shade 0"]][[1]]) +
	  ggplot2::theme_bw() +
	  ggplot2::facet_grid(variable ~ region)
	suppressWarnings(suppressMessages(ggplot2::ggsave(file=file.path(outpath, 'toxicity_vs_mean_input_read_count.pdf'), width=5, height=4)))

	### Scatterplots comparing toxicity estimates between replicates
	###########################

	#Singles, doubles and STOPs mutants with defined toxicity
	dms_dt <- dms_dt[is.reads0==T & (Nmut_aa==1 | Nmut_aa==2) & (!is.na(toxicity) | !is.na(toxicity_cond))]
	# dms_dt <- dms_dt[(Nmut_aa==1 | Nmut_aa==2) & (!is.na(toxicity) | !is.na(toxicity_cond))]
	dms_dt[Nmut_aa==2, toxicity1 := toxicity1_cond]
	dms_dt[Nmut_aa==2, toxicity2 := toxicity2_cond]
	dms_dt[Nmut_aa==2, toxicity3 := toxicity3_cond]
	# dms_dt[Nmut_aa==2, toxicity4 := toxicity4_cond]

	#Convert infinite toxicity values to NA
	dms_dt[is.infinite(toxicity1), toxicity1 := NA]
	dms_dt[is.infinite(toxicity2), toxicity2 := NA]
	dms_dt[is.infinite(toxicity3), toxicity3 := NA]
	# dms_dt[is.infinite(toxicity4), toxicity4 := NA]

	#Number of variants with defined toxicity values in each replicate
	print(paste0("All AA variants (single, double AA substitutions): ", dms_dt[,.N], " (", paste0(unlist(dms_dt[,.N,Nmut_aa][,2]), collapse = ", "), ")"))
	print(paste0("Rep1 AA variants (single, double AA substitutions): ", dms_dt[!is.na(toxicity1),.N], " (",paste0(unlist(dms_dt[!is.na(toxicity1),.N,Nmut_aa][,2]), collapse = ", "), ")"))
	print(paste0("Rep2 AA variants (single, double AA substitutions): ", dms_dt[!is.na(toxicity2),.N], " (",paste0(unlist(dms_dt[!is.na(toxicity2),.N,Nmut_aa][,2]), collapse = ", "), ")"))
	print(paste0("Rep3 AA variants (single, double AA substitutions): ", dms_dt[!is.na(toxicity3),.N], " (",paste0(unlist(dms_dt[!is.na(toxicity3),.N,Nmut_aa][,2]), collapse = ", "), ")"))
	# dms_dt[!is.na(toxicity4),.N]

	#Plot replicate toxicity correlation
	num_rep = 3
	# num_rep = 4
	tardbpdms__ggpairs_binhex(
		input_dt = dms_dt[Nmut_aa %in% c(1,2) & region=="290",.SD,.SDcols = paste0('toxicity', c(1:num_rep))], 
		# input_dt_upper = dms_dt[Nmut_aa==1,.SD,.SDcols = paste0('toxicity', c(1:num_rep))],
		output_file = file.path(outpath, "replicate_scatter_binhex_290.pdf"),
		xlab = "Toxicity",
		ylab = "Toxicity",
		width = 5,
		height = 5,
		label_size = 2,
  	plot_colours = c(colour_scheme[["shade 0"]][[1]], colour_scheme[["shade 0"]][[2]], colour_scheme[["shade 0"]][[3]]),
  	colour_limits = c(0, 100))
	tardbpdms__ggpairs_binhex(
		input_dt = dms_dt[Nmut_aa %in% c(1,2) & region=="332",.SD,.SDcols = paste0('toxicity', c(1:num_rep))], 
		# input_dt_upper = dms_dt[Nmut_aa==1,.SD,.SDcols = paste0('toxicity', c(1:num_rep))],
		output_file = file.path(outpath, "replicate_scatter_binhex_332.pdf"),
		xlab = "Toxicity",
		ylab = "Toxicity",
		width = 5,
		height = 5,
		label_size = 2,
  	plot_colours = c(colour_scheme[["shade 0"]][[1]], colour_scheme[["shade 0"]][[2]], colour_scheme[["shade 0"]][[3]]),
  	colour_limits = c(0, 100))

	#Plot replicate 1 and 2 correlation
	plot_dt <- copy(dms_dt)[Nmut_aa %in% c(1,2) & !is.na(toxicity1) & !is.na(toxicity2)]
  plot_colours = c(colour_scheme[["shade 0"]][[1]], colour_scheme[["shade 0"]][[2]], colour_scheme[["shade 0"]][[3]])
  temp_cor <- plyr::ddply(plot_dt, c("region"), plyr::summarize, cor = round(cor(toxicity1, toxicity2, use = "pairwise.complete.obs"), 2), n = length(toxicity1))
  d <- ggplot2::ggplot(plot_dt, ggplot2::aes(toxicity1, toxicity2)) +
  	ggplot2::stat_binhex(bins=50) +
  	ggplot2::geom_abline(linetype = 2) + 
	  ggplot2::geom_text(data = temp_cor, ggplot2::aes(label = paste0("R = ", cor, "\nn = ", n)), x = 0, y = 0, size = 2) +
    ggplot2::scale_fill_gradientn(colours=plot_colours,name = "Frequency", na.value=plot_colours[length(plot_colours)], limits=c(0, 100)) +
    ggplot2::ylab("Toxicity (rep2)") +
    ggplot2::xlab("Toxicity (rep1)") +
    ggplot2::theme_classic() + 
    ggplot2::facet_grid(region ~.)
  ggplot2::ggsave(file=file.path(outpath, "replicate1_replicate2_scatter.pdf"), width=5, height=5)

	### Scale toxicity by median standard deviation of AA mutants in each library
	###########################

	tardbpdms__scale_toxicity(
	  toxicity_path=toxicity_list[[1]],
	  outpath=file.path(outpath, names(toxicity_list)[1]))

	tardbpdms__scale_toxicity(
	  toxicity_path=toxicity_list[[2]],
	  outpath=file.path(outpath, names(toxicity_list)[2]))

	### Load data
	###########################

	#DMS variant data (for individual replicates)
	dms_dt1 <- tardbpdms__load_toxicity(file.path(outpath, names(toxicity_list)[1]), object=T, read_filter=F)
	dms_dt2 <- tardbpdms__load_toxicity(file.path(outpath, names(toxicity_list)[2]), object=T, read_filter=F)

	#Rename column names consistently
	names(dms_dt1)[grep('toxicity[1-4]$', names(dms_dt1))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]$', names(dms_dt1))))
	names(dms_dt2)[grep('toxicity[1-4]$', names(dms_dt2))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]$', names(dms_dt2))))
	names(dms_dt1)[grep('toxicity[1-4]_cond$', names(dms_dt1))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]_cond$', names(dms_dt1))), "_cond")
	names(dms_dt2)[grep('toxicity[1-4]_cond$', names(dms_dt2))] <- paste0("toxicity", 1:length(grep('toxicity[1-4]_cond$', names(dms_dt2))), "_cond")

	#Combine regions
	dms_dt1[, region := names(toxicity_list)[1]]
	dms_dt2[, region := names(toxicity_list)[2]]
	dms_dt <- rbind(
		dms_dt1[,.SD,,.SDcols = c("Nmut_aa", "region", "toxicity", "toxicity_cond", "toxicity_uncorr", "STOP", "mean_count", "is.reads0",
			names(dms_dt1)[grep('toxicity[1-4]$', names(dms_dt1))], names(dms_dt1)[grep('toxicity[1-4]_cond$', names(dms_dt1))])], 
		dms_dt2[,.SD,,.SDcols = c("Nmut_aa", "region", "toxicity", "toxicity_cond", "toxicity_uncorr", "STOP", "mean_count", "is.reads0",
			names(dms_dt2)[grep('toxicity[1-4]$', names(dms_dt2))], names(dms_dt2)[grep('toxicity[1-4]_cond$', names(dms_dt2))])])

	### Mean counts vs uncorrected toxicity
	###########################

	#Set uncorrected toxicity to toxicity for singles
	dms_dt[Nmut_aa==1, toxicity_uncorr := toxicity]
	#Set conditional toxicity to toxicity for singles
	dms_dt[Nmut_aa==1, toxicity_cond := toxicity]
	plot_df <- reshape2::melt(as.data.frame(dms_dt[,.(mean_count, toxicity_uncorr, toxicity_cond, region)]), id = c("mean_count", "region"))
	d <- ggplot2::ggplot(plot_df, ggplot2::aes(mean_count, value)) +
		ggplot2::scale_x_log10() +
    ggplot2::stat_binhex(bins=30) +
	  ggplot2::xlab("Mean input read count") +
	  ggplot2::ylab("Toxicity") +
	  ggplot2::geom_hline(yintercept = 0, linetype=2) +
	  ggplot2::geom_vline(xintercept = 10, linetype=2, colour = colour_scheme[["shade 0"]][[1]]) +
	  ggplot2::theme_bw() +
	  ggplot2::facet_grid(variable ~ region)
	suppressWarnings(suppressMessages(ggplot2::ggsave(file=file.path(outpath, 'toxicity_vs_mean_input_read_count_scale.pdf'), width=5, height=4)))

	### Scatterplots comparing toxicity estimates between replicates
	###########################

	#Singles, doubles and STOPs mutants with defined toxicity
	dms_dt <- dms_dt[is.reads0==T & (Nmut_aa==1 | Nmut_aa==2) & (!is.na(toxicity) | !is.na(toxicity_cond))]
	# dms_dt <- dms_dt[(Nmut_aa==1 | Nmut_aa==2) & (!is.na(toxicity) | !is.na(toxicity_cond))]
	dms_dt[Nmut_aa==2, toxicity1 := toxicity1_cond]
	dms_dt[Nmut_aa==2, toxicity2 := toxicity2_cond]
	dms_dt[Nmut_aa==2, toxicity3 := toxicity3_cond]
	# dms_dt[Nmut_aa==2, toxicity4 := toxicity4_cond]

	#Convert infinite toxicity values to NA
	dms_dt[is.infinite(toxicity1), toxicity1 := NA]
	dms_dt[is.infinite(toxicity2), toxicity2 := NA]
	dms_dt[is.infinite(toxicity3), toxicity3 := NA]
	# dms_dt[is.infinite(toxicity4), toxicity4 := NA]

	#Number of variants with defined toxicity values in each replicate
	print(paste0("All AA variants (single, double AA substitutions): ", dms_dt[,.N], " (", paste0(unlist(dms_dt[,.N,Nmut_aa][,2]), collapse = ", "), ")"))
	print(paste0("Rep1 AA variants (single, double AA substitutions): ", dms_dt[!is.na(toxicity1),.N], " (",paste0(unlist(dms_dt[!is.na(toxicity1),.N,Nmut_aa][,2]), collapse = ", "), ")"))
	print(paste0("Rep2 AA variants (single, double AA substitutions): ", dms_dt[!is.na(toxicity2),.N], " (",paste0(unlist(dms_dt[!is.na(toxicity2),.N,Nmut_aa][,2]), collapse = ", "), ")"))
	print(paste0("Rep3 AA variants (single, double AA substitutions): ", dms_dt[!is.na(toxicity3),.N], " (",paste0(unlist(dms_dt[!is.na(toxicity3),.N,Nmut_aa][,2]), collapse = ", "), ")"))
	# dms_dt[!is.na(toxicity4),.N]

	#Plot replicate toxicity correlation
	num_rep = 3
	# num_rep = 4
	tardbpdms__ggpairs_binhex(
		input_dt = dms_dt[Nmut_aa %in% c(1,2) & region=="290",.SD,.SDcols = paste0('toxicity', c(1:num_rep))], 
		# input_dt_upper = dms_dt[Nmut_aa==1,.SD,.SDcols = paste0('toxicity', c(1:num_rep))],
		output_file = file.path(outpath, "replicate_scatter_binhex_290_scale.pdf"),
		xlab = "Toxicity",
		ylab = "Toxicity",
		width = 5,
		height = 5,
		label_size = 2,
  	plot_colours = c(colour_scheme[["shade 0"]][[1]], colour_scheme[["shade 0"]][[2]], colour_scheme[["shade 0"]][[3]]),
  	colour_limits = c(0, 100))
	tardbpdms__ggpairs_binhex(
		input_dt = dms_dt[Nmut_aa %in% c(1,2) & region=="332",.SD,.SDcols = paste0('toxicity', c(1:num_rep))], 
		# input_dt_upper = dms_dt[Nmut_aa==1,.SD,.SDcols = paste0('toxicity', c(1:num_rep))],
		output_file = file.path(outpath, "replicate_scatter_binhex_332_scale.pdf"),
		xlab = "Toxicity",
		ylab = "Toxicity",
		width = 5,
		height = 5,
		label_size = 2,
  	plot_colours = c(colour_scheme[["shade 0"]][[1]], colour_scheme[["shade 0"]][[2]], colour_scheme[["shade 0"]][[3]]),
  	colour_limits = c(0, 100))

	#Plot replicate 1 and 2 correlation
	plot_dt <- copy(dms_dt)[Nmut_aa %in% c(1,2) & !is.na(toxicity1) & !is.na(toxicity2)]
  plot_colours = c(colour_scheme[["shade 0"]][[1]], colour_scheme[["shade 0"]][[2]], colour_scheme[["shade 0"]][[3]])
  temp_cor <- plyr::ddply(plot_dt, c("region"), plyr::summarize, cor = round(cor(toxicity1, toxicity2, use = "pairwise.complete.obs"), 2), n = length(toxicity1))
  d <- ggplot2::ggplot(plot_dt, ggplot2::aes(toxicity1, toxicity2)) +
  	ggplot2::stat_binhex(bins=50) +
  	ggplot2::geom_abline(linetype = 2) + 
	  ggplot2::geom_text(data = temp_cor, ggplot2::aes(label = paste0("R = ", cor, "\nn = ", n)), x = 0, y = 0, size = 2) +
    ggplot2::scale_fill_gradientn(colours=plot_colours,name = "Frequency", na.value=plot_colours[length(plot_colours)], limits=c(0, 100)) +
    ggplot2::ylab("Toxicity (rep2)") +
    ggplot2::xlab("Toxicity (rep1)") +
    ggplot2::theme_classic() + 
    ggplot2::facet_grid(region ~.)
  ggplot2::ggsave(file=file.path(outpath, "replicate1_replicate2_scatter_scale.pdf"), width=5, height=5)

}
lehner-lab/tardbpdms documentation built on July 19, 2019, 7:24 p.m.