#' abetadms_combine_fitness
#'
#' Combine fitness estimates from two different DMS experiments (regions).
#'
#' @param fitness_list named list of folder paths with fitness estimates (required)
#' @param outpath output path for plots and saved objects (required)
#' @param disease_mut_file table of human disease mutations and classifications (required)
#' @param colour_scheme colour scheme file (required)
#' @param execute whether or not to execute the analysis (default: TRUE)
#'
#' @return A data.table with normalised fitness data
#' @export
#' @import data.table
abetadms_combine_fitness <- function(
fitness_list,
disease_mut_file,
outpath,
colour_scheme,
execute = TRUE
){
#Return previous results if analysis not executed
if(!execute){
load(file.path(outpath, "combine_fitness_values.RData"))
return(dms_dt_center_scale)
}
#Display status
message(paste("\n\n*******", "running stage: abetadms_combine_fitness", "*******\n\n"))
#Create output directory
abetadms__create_dir(abetadms_dir = outpath)
### Load data
###########################
# #Growth rate data
# gr_df <- read.table(growth_rate_file, header = F, skip = 1, stringsAsFactors = F, sep = "\t", quote = "", row.names = 1)[,1:4]
# #Mean growth rate table
# mean_gr_df <- data.frame(growth_rate = apply(log(gr_df), 1, mean, na.rm = T))
# mean_gr_df[is.na(mean_gr_df)] <- 0
# sd_gr_df <- data.frame(growth_rate = apply(log(gr_df), 1, sd, na.rm = T))
# sd_gr_df[is.na(sd_gr_df)] <- 0
#DMS variant data
dms_dt1 <- abetadms__load_fitness(fitness_list[[1]])
#Combine regions
dms_dt1[, region := names(fitness_list)[1]]
dms_dt <- dms_dt1
#Remove missing fitness values
dms_dt <- dms_dt[!is.na(fitness) | (!is.na(fitness_cond) & !is.na(fitness1) & !is.na(fitness2)),]
#Absolute position (singles)
dms_dt[, Pos_abs := Pos+as.numeric(region)-1]
#Absolute position (doubles)
dms_dt[, Pos_abs1 := Pos1+as.numeric(region)-1]
dms_dt[, Pos_abs2 := Pos2+as.numeric(region)-1]
#Mutation code (singles)
dms_dt[, mut_code := paste0(WT_AA, Pos_abs, Mut)]
#Mutation code (doubles)
dms_dt[, mut_code1 := paste0(WT_AA1, Pos_abs1, Mut1)]
dms_dt[, mut_code2 := paste0(WT_AA2, Pos_abs2, Mut2)]
# #Add growth rate data
# dms_dt[, growth_rate := mean_gr_df[mut_code,]]
# dms_dt[, growth_rate_low := mean_gr_df[mut_code,]-1.96*sd_gr_df[mut_code,]]
# dms_dt[, growth_rate_high := mean_gr_df[mut_code,]+1.96*sd_gr_df[mut_code,]]
#Annotate number and type of mutations
dms_dt[, mut_type := paste0("Nmut_aa: ", Nmut_aa, ", Nmut_codons: ", Nmut_codons)]
dms_dt[STOP==T, mut_type := "STOP"]
dms_dt[Nmut_codons >= 3 & Nmut_aa==0, mut_type := paste0("Nmut_aa: 0, Nmut_codons: >=3")]
dms_dt[, mut_type := factor(mut_type, levels = unique(mut_type[order(Nmut_aa+10*as.numeric(STOP), Nmut_codons, decreasing = F)]))]
#Add disease mutation information
dis_mut <- read.table(disease_mut_file, header = T, sep = "\t", stringsAsFactors = F, row.names = 1)
dms_dt[, hmut_cat := "healthy"]
dms_dt[mut_code %in% rownames(dis_mut), hmut_cat := "diseased"]
### 1. Untransformed data
###########################
#Plot distributions split by number and type of mutation
abetadms__plot_fitness_distributions(
input_dt = dms_dt,
output_file = file.path(outpath, 'single_double_syn_mutant_fitness_hist.pdf'),
colour_scheme = c("black", "darkgrey"))
# #Plot correlation between fitness estimates and growth rate
# abetadms__plot_fitness_vs_growthrate(
# input_dt = dms_dt,
# output_file = file.path(outpath, 'single_mutant_fitness_growthrate.pdf'),
# colour_scheme = c("black", "darkgrey"))
# #Plot correlation between fitness estimates and growth rate (separately for each region)
# for(i in 1:length(fitness_list)){
# abetadms__plot_fitness_vs_growthrate(
# input_dt = dms_dt[region==names(fitness_list)[i],],
# output_file = file.path(outpath, paste0('single_mutant_fitness_growthrate_', names(fitness_list)[i], '.pdf')),
# colour_scheme = c("black", "darkgrey"))
# }
### 2. Center fitness scores using weighted mean of single codon silent mutations
###########################
#Center fitness scores using weighted mean of single codon silent mutations
# silent_codon_center <- dms_dt[mut_type=="Nmut_aa: 0, Nmut_codons: 1",.(mean=median(fitness, na.rm = T)),region]
silent_codon_center <- dms_dt[mut_type=="Nmut_aa: 0, Nmut_codons: 1",.(mean=weighted.mean(fitness, sigma^-2, na.rm = T)),region]
dms_dt_center <- copy(dms_dt)
for(i in names(dms_dt_center)[grep("^fitness", names(dms_dt_center))]){
dms_dt_center[region==names(fitness_list)[1],(i) := .SD - silent_codon_center[region==names(fitness_list)[1],mean],,.SDcols=i]
}
#Plot distributions split by number and type of mutation
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center,
output_file = file.path(outpath, 'single_double_syn_mutant_fitness_hist_center.pdf'),
colour_scheme = c("black", "darkgrey"))
# #Plot correlation between fitness estimates and growth rate
# abetadms__plot_fitness_vs_growthrate(
# input_dt = dms_dt_center,
# output_file = file.path(outpath, 'single_mutant_fitness_growthrate_center.pdf'),
# colour_scheme = c("black", "darkgrey"))
### 3. Scale fitness scores using ratio between weighted mean of single STOP codon fitness
###########################
#Scale fitness scores using ratio between weighted mean of single STOP codon fitness
stop_center <- dms_dt_center[mut_type=="STOP" & Nmut_aa==1,.(mean=weighted.mean(fitness, sigma^-2, na.rm = T)),region]
dms_dt_center_scale <- copy(dms_dt_center)
# for(i in names(dms_dt_center_scale)[grep("^fitness", names(dms_dt_center_scale))]){
# dms_dt_center_scale[region==names(fitness_list)[2], (i) := .SD * stop_center[region==names(fitness_list)[1],mean]/stop_center[region==names(fitness_list)[2],mean],,.SDcols=i]
# j <- gsub("fitness", "sigma", i)
# dms_dt_center_scale[region==names(fitness_list)[2], (j) := .SD * stop_center[region==names(fitness_list)[1],mean]/stop_center[region==names(fitness_list)[2],mean],,.SDcols=j]
# j <- gsub('fitness','fitness',i)
# dms_dt_center_scale[, (j) := -.SD[[1]],,.SDcols = i]
# }
#Plot distributions split by number and type of mutation
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center_scale,
output_file = file.path(outpath, 'single_double_syn_mutant_fitness_hist_center_scale.pdf'),
colour_scheme = c("black", "darkgrey"))
# #Plot correlation between fitness estimates and growth rate
# abetadms__plot_fitness_vs_growthrate(
# input_dt = dms_dt_center_scale,
# output_file = file.path(outpath, 'single_mutant_fitness_growthrate_center_scale.pdf'),
# colour_scheme = c("black", "darkgrey"))
### Fitness w.r.t. position (singles)
###########################
#Repeat for before normalisation and after centering and scaling
dms_dt_list <- list(
"before_norm" = dms_dt,
"center" = dms_dt_center,
"center_scale" = dms_dt_center_scale
)
for(i in names(dms_dt_list)){
abetadms__plot_fitness_vs_position(
input_dt = dms_dt_list[[i]][STOP==T & Nmut_aa==1,],
use_sigma=T,
output_file=file.path(outpath, paste0('fitness_vs_position_STOP_', i, '.pdf')),
ylab="Relative fitness (log)",
label_offset = 0.01)
abetadms__plot_fitness_vs_position(
input_dt = dms_dt_list[[i]][STOP==F & Nmut_aa==1,],
use_sigma=F,
output_file=file.path(outpath, paste0('fitness_vs_position_single_AA_mutants_', i, '.pdf')),
ylab="Relative fitness (log)",
label_offset = 0.01)
abetadms__plot_fitness_vs_position(
input_dt = copy(dms_dt_list[[i]])[, fitness := abs(fitness)][STOP==F & Nmut_aa==1,],
use_sigma=F,
output_file=file.path(outpath, paste0('fitnessabs_vs_position_single_AA_mutants_', i, '.pdf')),
ylab="Absolute relative fitness (log)",
span = 0.25,
label_offset = 0.01,
plot_hotspot = T)
}
### Bounds on fitness
###########################
#Upper bound on fitness
upper_bound_F <- list()
for(i in 1:length(fitness_list)){
doubles_surpassing_lower_bound_F <- dms_dt_center_scale[is.fitness & Nmut_aa==2 & region==names(fitness_list)[i] & fitness_cond<stop_center[region==names(fitness_list)[i],mean],.N,]
doubles_negative_F <- dms_dt_center_scale[is.fitness & Nmut_aa==2 & region==names(fitness_list)[i] & fitness_cond<0,.N,]
upper_bound_quantile <- 1 - doubles_surpassing_lower_bound_F / doubles_negative_F
upper_bound_F[[names(fitness_list)[i]]] <- quantile(dms_dt_center_scale[is.fitness & Nmut_aa==2 & region==names(fitness_list)[i] & fitness_cond>0,fitness_cond,], upper_bound_quantile)
}
#Plot centered and scaled distributions split by number and type of mutation with bounds
temp_cols <- as.list(c("black", "black"))
names(temp_cols) <- c(names(fitness_list)[1], "STOP")
dms_dt_center_scale_boundaries <- list(
upper_bound_F[[1]],
"STOP" = stop_center[region==names(fitness_list)[1],mean])
names(dms_dt_center_scale_boundaries)[1] <- names(fitness_list)
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center_scale,
output_file = file.path(outpath, "single_double_syn_mutant_fitness_hist_center_scale_bounds.pdf"),
boundary_list=dms_dt_center_scale_boundaries,
boundary_colour_list=temp_cols,
colour_scheme = c("black"))
#Plot centered and scaled distributions split by number and type of mutation with bounds
temp_cols <- as.list(c("black", "black"))
names(temp_cols) <- c(names(fitness_list)[1], "STOP")
dms_dt_center_scale_boundaries <- list(
upper_bound_F[[1]],
"STOP" = stop_center[region==names(fitness_list)[1],mean])
names(dms_dt_center_scale_boundaries)[1] <- names(fitness_list)
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center_scale,
output_file = file.path(outpath, "single_double_syn_mutant_fitness_hist_center_scale_lowerbound.pdf"),
boundary_list=dms_dt_center_scale_boundaries[2],
boundary_colour_list=temp_cols[2],
colour_scheme = c("black"))
#Plot centered and scaled distributions split by number and type of mutation with bounds (x axis limits according to 99% of single and double mutant data)
xlim_focus <- quantile(unlist(dms_dt_center_scale[Nmut_aa==1 | Nmut_aa==2,.(fitness, fitness_cond)]), probs = c(0.005, 0.995), na.rm = T)
xlim_focus[1] <- (xlim_focus[1]-0.15)
xlim_focus[2] <- (xlim_focus[2]+0.15)
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center_scale[Nmut_aa %in% c(1, 2) & !STOP,],
output_file = file.path(outpath, 'single_double_syn_mutant_fitness_hist_center_scale_bounds_xlimfocus.pdf'),
boundary_list=dms_dt_center_scale_boundaries,
boundary_colour_list=temp_cols,
xlim = xlim_focus,
colour_scheme = c("black"),
width = 4,
height = 3)
#Plot centered and scaled distributions split by number and type of mutation with STOP bound (x axis limits according to 99% of single and double mutant data)
xlim_focus <- quantile(unlist(dms_dt_center_scale[Nmut_aa==1 | Nmut_aa==2,.(fitness, fitness_cond)]), probs = c(0.005, 0.995), na.rm = T)
xlim_focus[1] <- (xlim_focus[1]-0.15)
xlim_focus[2] <- (xlim_focus[2]+0.15)
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center_scale[Nmut_aa %in% c(1, 2) & !STOP,],
output_file = file.path(outpath, 'single_double_syn_mutant_fitness_hist_center_scale_lowerbound_xlimfocus.pdf'),
boundary_list=dms_dt_center_scale_boundaries[2],
boundary_colour_list=temp_cols[2],
xlim = xlim_focus,
colour_scheme = c("black"),
width = 4,
height = 3)
#Plot centered and scaled distributions (for singles and doubles combined) with STOP bound (x axis limits according to 99% of single and double mutant data)
xlim_focus <- quantile(unlist(dms_dt_center_scale[Nmut_aa==1 | Nmut_aa==2,.(fitness, fitness_cond)]), probs = c(0.005, 0.995), na.rm = T)
xlim_focus[1] <- (xlim_focus[1]-0.05)
xlim_focus[2] <- (xlim_focus[2]+0.05)
abetadms__plot_fitness_distributions(
input_dt = dms_dt_center_scale[Nmut_aa %in% c(1, 2) & !STOP,],
output_file = file.path(outpath, 'single_double_syn_mutant_fitness_hist_center_scale_upperbound_xlimfocus_combined.pdf'),
boundary_list=dms_dt_center_scale_boundaries[2],
boundary_colour_list=temp_cols[2],
xlim = xlim_focus,
colour_scheme = c("black"),
width = 4,
height = 3,
facet_by_mut_type = F)
#Plot centered and scaled distributions (for human disease mutations only)
d <- ggplot2::ggplot(dms_dt_center_scale[hmut_cat == "diseased",], ggplot2::aes(as.factor(0), fitness)) +
ggplot2::geom_boxplot() +
ggplot2::xlab("Fitness") +
ggplot2::theme_classic() +
ggplot2::coord_cartesian(ylim = xlim_focus)
ggplot2::ggsave(file=file.path(outpath, 'single_double_syn_mutant_fitness_hist_center_scale_upperbound_xlimfocus_combined_hmut.pdf'), width=1, height=4, useDingbats=FALSE)
### Mutation statistics
###########################
#Variant classification statistics
plot_dt <- dms_dt_center_scale
plot_dt[region == names(fitness_list)[1],.N,mut_type]
#Significantly more/less aggregation prone variants
tox_dt <- copy(dms_dt_center_scale)
tox_dt[Nmut_aa==1, fitness_z := fitness/sigma]
tox_dt[Nmut_aa==2, fitness_z := fitness_cond/sigma_cond]
tox_dt[, fitness_sig := p.adjust(2*pnorm(-abs(fitness_z)), method = "BH")<0.05]
#Singles
print(paste0("Significantly more aggregation prone single AA variants: ", tox_dt[Nmut_aa==1 & fitness_sig & fitness>0,.N,]))
print(paste0("Significantly less aggregation prone single AA variants: ", tox_dt[Nmut_aa==1 & fitness_sig & fitness<0,.N,]))
#Doubles
print(paste0("Significantly more aggregation prone double AA variants: ", tox_dt[Nmut_aa==2 & fitness_sig & fitness_cond>0,.N,]))
print(paste0("Significantly less aggregation prone double AA variants: ", tox_dt[Nmut_aa==2 & fitness_sig & fitness_cond<0,.N,]))
#Both
print(paste0("Significantly more aggregation prone AA variants: ", tox_dt[((Nmut_aa==1 & fitness>0) | (Nmut_aa==2 & fitness_cond>0)) & fitness_sig,.N,]))
print(paste0("Significantly less aggregation prone AA variants: ", tox_dt[((Nmut_aa==1 & fitness<0) | (Nmut_aa==2 & fitness_cond<0)) & fitness_sig,.N,]))
### Save combined fitness
###########################
#Write all single AA mutant tables to output files
fwrite(dms_dt[Nmut_aa==1 & !STOP,], file = file.path(outpath, "combine_fitness_values_singles_raw.txt"), sep = "\t")
fwrite(dms_dt_center[Nmut_aa==1 & !STOP,], file = file.path(outpath, "combine_fitness_values_singles_center.txt"), sep = "\t")
fwrite(dms_dt_center_scale[Nmut_aa==1 & !STOP,], file = file.path(outpath, "combine_fitness_values_singles_center_scale.txt"), sep = "\t")
#RData object
save(dms_dt, dms_dt_center, dms_dt_center_scale, dms_dt_center_scale_boundaries, file = file.path(outpath, "combine_fitness_values.RData"))
#Supplementary data file
fwrite(dms_dt_center_scale, file = file.path(outpath, "supplementary_table_normalised_fitness_estimates_all.txt"), sep = "\t")
#Return normalised fitness data.table
return(dms_dt_center_scale)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.