KINATESTID: A Pipeline for Kinase-Specific Artificial Peptide Design

Documented in substrates_qc

#' Generate QC report
#'
#' @param substrates_dt An enzymatic preference data.table. 
#' 
#' @param path The file path where output data should be saved.
#'
#' @return A ggplot2 object.
#' @export
#'
substrates_qc <- function(substrates_dt, path){
  unique_substrates <- bkgrnd_corr(substrates_dt)
  bin_dt <- unique(substrates_dt[,substrate_barcode, by = .(sample, control)])
  
  sample_bin_graph <- ggplot(bin_dt, aes(x = sample)) + 
    geom_histogram(stat = "count") + theme_bw() + 
    xlab("No of Samples Present") +
    ylab("No of Unique Substrates")
  
  venn_data <- peptide_intersect(substrates_dt)
  
  sub_heatmap <- substrates_heatmap(unique_substrates, T)
  
  counts_dt <- unique(substrates_dt[,substrate_barcode,by=.(file_name, class)])
  counts_graph <- ggplot(counts_dt, aes(x = file_name, fill = class)) + 
                  geom_histogram(stat = "count") + theme_bw() + 
                  facet_wrap(~class, scales = "free") +
                  theme(axis.text = element_text(angle = 45, hjust = 1), 
                  plot.margin = unit(c(0,1,0,2), "cm"), legend.position =
                  "none") + xlab("")
  
  top_row <- plot_grid(counts_graph)
  
  first_col <- plot_grid(sample_bin_graph, venn_data[[1]], ncol=1)
  second_col <- plot_grid(sub_heatmap)
  bottom_row <- plot_grid(first_col, second_col, ncol = 2, rel_widths = 
                         c(1, 1.5))
  
  
  gg_all <- plot_grid(top_row, bottom_row, ncol = 1, rel_heights = c(1,1.5))
  
  output_dir = file.path(path,"output")
  
  if (!dir.exists(output_dir)){
    dir.create(output_dir)
  }
  
  ggsave("substrates_qc_plot.jpg", gg_all, width = 8.5, height= 11, units = 
        c("in"), path = output_dir)
  
  qc_output <- list(gg_all, venn_data[[2]])
  return(qc_output)
}

#' Find peptide commonality across replicates
#'
#' @param substrates_dt An enzymatic preference data.table. 
#'
#' @return A ggplot2 object.
#' @export
#'
peptide_intersect <- function(substrates_dt){
  intersect_var = "replicate"
  sample_key <- unique(substrates_dt[, ..intersect_var])[[1]]
  no_samples <- length(sample_key)
  sample_perm <- combn(sample_key, 2, simplify=F)
  
  comparisons <- list()
  barcode <-  unique(substrates_dt[, substrate_barcode, by = intersect_var])
  barcode_lengths <- barcode[,length(unique(substrate_barcode)), 
                             by= intersect_var]
  
  for (i in 1:no_samples){
    comparisons[[i]] <-  unique(
      barcode[replicate == sample_key[i]][,substrate_barcode]
    )
    names(comparisons)[i] <- sample_key[i]
  }
  
  peptide_overlaps <- list()
  for (j in 1:length(sample_perm)){
    index<-paste(sample_perm[[j]])
    peptide_overlaps[[j]] <- Reduce(intersect, 
                                    comparisons[names(comparisons) %in% dput(index)])
    names(peptide_overlaps)[j] <- paste0(index, collapse = ":")
  }
  
  if (no_samples >=3){
    peptide_overlaps[[length(peptide_overlaps) + 1]] <- Reduce(intersect, 
                                                               peptide_overlaps)
    names(peptide_overlaps)[length(peptide_overlaps)] <- paste0(sample_key, 
                                                                collapse = ":")
  }
  
  if (no_samples == 1){
    venn_plot <- ggplot() + theme_void()
  }
  
  if (no_samples == 2){
    col_pal <- brewer.pal(no_samples, "Dark2")
    col_pal <- col_pal[1:no_samples] #workaround for required minimum colors
    area1 <- barcode_lengths[1][[2]]
    area2 <- barcode_lengths[2][[2]]
    n12 <- length(peptide_overlaps[[1]])
    labels <- paste("Replicate",sample_key, sep="\t ")
    venn_plot<- draw.pairwise.venn(area1, area2, n12, 
                                  scaled = T, fill = col_pal, col = 
                                  rep("white", 2), category = labels, 
                                  fontface = rep("bold", 3), cat.fontfamily = 
                                  rep("Arial", 2), cat.fontface = 
                                  rep("bold", 2), cat.col = col_pal, cex = 
                                  rep(1.5, 3), margin=0.08)
    venn_plot <- as.ggplot(grobTree(venn_plot))
  }
  
  if (no_samples == 3){
    col_pal <- brewer.pal(3, "Dark2")
    area1 <- barcode_lengths[1][[2]]
    area2 <- barcode_lengths[2][[2]]
    area3 <- barcode_lengths[3][[2]]
    n12 <- length(peptide_overlaps[[1]])
    n13 <- length(peptide_overlaps[[2]])
    n23 <- length(peptide_overlaps[[3]])
    n123 <- length(peptide_overlaps[[4]])
    labels <- paste("Replicate",sample_key, sep="\t ")
    venn_plot<- draw.triple.venn(area1, area2, area3, n12, n23, n13, n123, 
                                scaled = T, fill = col_pal, col =
                                rep("white", 3), category = labels, fontface =
                                rep("bold", 7), cat.fontfamily = 
                                rep("Arial", 3), cat.fontface = rep("bold", 3),
                                cat.col = col_pal, cex = rep(1.5, 7), 
                                margin=0.08)
    venn_plot <- as.ggplot(grobTree(venn_plot))
  }
  overlap_output <- list(venn_plot, peptide_overlaps)
  return(overlap_output)
  
}

#' Create a PSSM volcano plot
#'
#' @param pssm A list containing two data tables: 1) Exact Test P-values
#'  and 2) Fisher Exact Odds.
#'  
#' @param odds_cutoff The log 2 odds ratio threshold for being considered 
#' significant. If not specified it is set to 0.5 automatically.
#' 
#' @param pval_cutoff The threshold p-value to be considered significant. 
#' If not specified it is set to 0.05 automatically.
#'
#' @return A ggplot2 object
#' @export
#'
pssm_volcano <- function(pssm, odds_cutoff = NULL, pval_cutoff = NULL){
  if (is.null(odds_cutoff)){
    odds_cutoff <- 0.5
  }
  if(is.null(pval_cutoff)){
    pval_cutoff <- 0.05
  }
  
  pssm_long <- fisher_long(pssm)
  pssm_long[, log2Odds:= log2(fisher_odds)]
  
  volcano_plot <- EnhancedVolcano(pssm_long, lab = pssm_long[, barcode], 
                                  x='log2Odds', y='fisher_pval', 
                                  FCcutoff = odds_cutoff, 
                                  pCutoff = pval_cutoff, 
                                  boxedLabels = TRUE, 
                                  drawConnectors = TRUE, 
                                  title = NULL, 
                                  subtitle = NULL,
                                  xlab = bquote(~Log[2]~"fisher odds"))
  
}

#' Create heatmap of PSSM data
#'
#' @param substrates_dt An enzymatic preference data.table. 
#' 
#' @param scramble A logical value indicating whether the rows should be 
#' randomly shuffled. Set to FALSE by default.
#' 
#' @param seed An arbitrary value to generate reproducible random numbers. 
#' Only needed in cases where scramble is set to TRUE.
#'
#' @return A ggplot2 object.
#' @export
#'
substrates_heatmap <- function(substrates_dt, scramble = FALSE, seed = NULL){
  if (is.null(seed)){
    seed = 42
  }
  
  set.seed(seed)
  
  aa_prop_table <- aa_freq(substrates_dt)
  
  if (isTRUE(scramble)){
    ptm_col <- aa_prop_table[,`0`]
    order<-sample(1:nrow(aa_prop_table))
    aa_prop_table <- aa_prop_table[order, ]
    aa_prop_table$`0` <- ptm_col
    aa_clean <- Filter(function(x)!all(is.na(x)), aa_prop_table)
    aa_heatmap <- data.matrix(aa_clean[,-1])
    aa_heatmap_plot <- pheatmap(aa_heatmap, cluster_cols = F, cluster_rows = F, 
                                scale= "column")
    aa_heatmap_plot <- as.ggplot(aa_heatmap_plot)
  }else{
    aa_clean <- Filter(function(x)!all(is.na(x)), aa_prop_table)
    aa_heatmap <- data.matrix(aa_clean[,-1])
    rownames(aa_heatmap)<- aa_clean[[1]]
    aa_heatmap_plot <- pheatmap(aa_heatmap, cluster_cols = F, cluster_rows = F, 
                                scale= "column")
    aa_heatmap_plot <- as.ggplot(aa_heatmap_plot)
  }
  return(aa_heatmap_plot)  
}


silico_heatmap <- function(output_dt){
  dm_palette <- colorRampPalette(c("#FFFFFF", "#FFB7B4", "#FF6361", "#FF3127"))
  key <- data.table::data.table(perf = c("inactive", "low", "medium", "high"), 
                                color = c(0, 1, 2, 3))
  
  output_dt$perf <- factor(output_dt$perf, 
                           levels = c("inactive", "low", "medium", "high"))
  output_merge <- merge(output_dt, key, by = "perf")
  dm_rownames <- reshape2::dcast(output_merge, 
                                 substrate_barcode ~ kinase, 
                                 value.var = "color")[, 1]
  dm <- data.matrix(reshape2::dcast(output_merge, 
                                    substrate_barcode ~ kinase, 
                                    value.var = "color", 
                                    value.factor = TRUE)[,-1])
  rownames(dm) <- dm_rownames
  dm_plot <- pheatmap(t(dm), 
                      cluster_rows = FALSE, 
                      cluster_cols = FALSE, 
                      color = dm_palette(5))
  dm_plot <- as.ggplot(dm_plot)
}


plot_activity<- function(output_dt, kinase_set){
  output_dt$perf <- as.factor(output_dt$perf)
  output_dt$perf <- factor(output_dt$perf, 
                           levels = c("inactive", "low", "medium", "high"))
  output_dt$kinase <- droplevels(output_dt$kinase)
  output_dt$kinase <- factor(output_dt$kinase, levels = 
                            kinase_set[kinase_set %in% output_dt[,kinase]])
  output_plot <- ggplot(output_dt, aes (x = kinase, y = perf, color = perf)) + 
                 geom_point(size = 5)+ theme_bw() + 
                 facet_wrap(~substrate_barcode) + theme(axis.text = 
                 element_text(face = "bold", size = "12"), axis.title =
                 element_text(face = "bold", size = 12), legend.position = "none",
                 axis.text.x = element_text(angle = 45, vjust = 0.6), 
                 strip.text = element_text(face="bold", size = 12)) 
}

#' Plot amino acid properties across the flank sequence
#'
#' @param substrates_dt An enzymatic preference data.table. 
#' 
#' @param uniprot_dt A data.table containing in silico negative peptide 
#' sequences for each `uniprot_id` found in the `substrates_dt` file.
#' 
#' @param path The file path where output data should be saved.
#'
#' @return A ggplot2 object.
#' @export
#'
substrates_property <- function(substrates_dt, uniprot_dt, path){
  property = c("property_chemical", "property_hydropathy", "property_volume", 
              "property_polar")
  
  property_plots <- list()
  property_tables <- list()
  
  for (i in 1:length(property)){
    fisher_tables <- substrate_fisher_test(substrates_dt, uniprot_dt, 
                                           "aa_property", property[i])
    fisher_melt <- melt(fisher_tables[[2]], 
                        varnames = c("property","flank_pos"), 
                        value.name = "fisher_odds")
    fisher_plot <- 
      ggplot(fisher_melt, aes(x = flank_pos, y = fisher_odds, colour = property)) +
      geom_point(size = 2) + 
      facet_wrap(~property, scales = "free_x", ncol = 2) +
      geom_line(size = 1) + 
      geom_hline(yintercept = 1, linetype = "dashed") + 
      theme_bw() +
      xlab("") +
      ylab("Fisher Odds") +
      theme(legend.position = "none")
    
    property_plots[[i]] <- fisher_plot
    property_tables[[i]] <- fisher_tables
  }
  
  top_row <- plot_grid(property_plots[[1]], property_plots[[3]])
  bottom_row <- plot_grid(property_plots[[2]], property_plots[[4]])
  substrates_property_plot <- plot_grid(top_row, bottom_row, rel_heights = 
                                          c(1, 0.5),ncol=1)
  
  output_dir = file.path(path, "output")
  
  if (!dir.exists(output_dir)){
    dir.create(output_dir)
  }
  
  ggsave("substrates_properties_plot.jpg", substrates_property_plot, width = 11,
         height= 8.5, units = c("in"), path = output_dir)
  
  property_pval_combn <- rbind(property_tables[[1]][[1]], 
                               property_tables[[2]][[1]], 
                               property_tables[[3]][[1]], 
                               property_tables[[4]][[1]])
  
  property_odds_combn <- rbind(property_tables[[1]][[2]], 
                               property_tables[[2]][[2]], 
                               property_tables[[3]][[2]], 
                               property_tables[[4]][[2]])
  
  property_tables_combn <- list(property_pval_combn, property_odds_combn)
  return(property_tables_combn)
}