R/plotptm.R

Defines functions ptm.plot find.aaindex

Documented in find.aaindex ptm.plot

## ---------- plot_ptm.R ---------- ##
#                                    #
#     plot.ptm                       #
#     find.aaindex                   #
#                                    #
## -------------------------------- ##

## ------------------------------------------------------------------------------- ##
# ptm.plot <- function(up_id, pdb="", property, ptm, dssp, window, sdata=TRUE, ...)#
## ------------------------------------------------------------------------------- ##
#' Plot Values of a Property and PTM Sites Along the Protein Sequence
#' @description Represents the values of a property and show the PTM sites along a protein sequence.
#' @usage ptm.plot(up_id, pdb = "", property, ptm, dssp = 'compute',
#'                 window = 1, sdata = TRUE, ...)
#' @param up_id a character string for the UniProt ID of the protein of interest.
#' @param pdb Optional argument to indicate the PDB and chain to be used (i.e. '1u8f.O'). If we leave this argument empty, the function will make the election for us when possible. If we introduce 'none', no PDB will be used.
#' @param property a character string indicating the property of interest. It should be one of 'sasa', 'acc', 'dpx', 'entropy7.aa','entropy7.codon', 'entropy100.aa', 'entropy100.codon', 'eiip', 'volume', 'polarizability', 'av.hyd', 'pi.hel', 'a.hel', 'b.sheet', 'B.factor', or 'own'.
#' @param ptm a character vector indicating the PTMs of interest. It should be among: 'ac' (acetylation), 'me' (methylation), 'meto' (sulfoxidation), 'p' (phosphorylation), 'su' (sumoylation) or 'ub' (ubiquitination), 'gl' (glycosylation),'reg' (regulatory), 'dis' (disease).
#' @param dssp character string indicating the method to compute DSSP. It should be either 'compute' or 'mkdssp'.
#' @param window positive integer indicating the window size for smoothing with a sliding window average (default: 1, i.e. no smoothing).
#' @param sdata logical, if TRUE save a Rda file with the relevant data in the current directory.
#' @param ... when the user want to use his/her own amino acid index, it can be passed as a named vector.
#' @details If the property 'own' is selected, a named vector with the own index for the 20 amino acids should be passed as argument.
#' Currently the supported properties are:
#' * sasa:  Solvent-accessible surface area (3D)
#' * acc: Accessibility (3D)
#' * dpx: Depth (3D)
#' * volume: Normalized van der Waals volume (1D)
#' * mutability: Relative mutability, Jones 1992, (1D)
#' * helix: Average relative probability of helix, Kanehisa-Tsong 1980,(1D)
#' * beta-sheet: Average relative probability of beta-sheet, Kanehisa-Tsong 1980, (1D)
#' * pi-helix: Propensity of amino acids within pi-helices, Fodje-Al-Karadaghi 2002, (1D)
#' * hydropathy: Hydropathy index, Kyte-Doolittle 1982, (1D)
#' * avg.hyd: Normalized average hydrophobicity scales, Cid et al 1992, (1D)
#' * hplc: Retention coefficient in HPLC at pH7.4, Meek 1980, (1D)
#' * argos: Hydrophobicity index, Argos et al 1982, (1D)
#' * eiip: Electron-ion interaction potential, Veljkovic et al 1985, (1D)
#' * polarizability: Polarizability parameter, Charton-Charton 1982, (1D)
#' * entropy7.aa: Shannon entropy based on 7 species and protein sequences (EVO)
#' * entropy100.aa: Shannon entropy based on 100 species and protein sequences (EVO)
#' * entropy7.condon: Shannon entropy based on 7 species and codon sequences (EVO)
#' * entropy100.codon: Shannon entropy based on 100 species and codon sequences (EVO)
#' For 3D properties such as sasa, acc or dpx, for which different values can be obtained depending on the cuaternary structure, we first compute the property values for each residue in the whole protein and ploted agains the residue position. Then, the value for this property is computed in the isolated chain (a single polypeptide chain) and in a second plot, the differences between the values in the whole protein and the chain are ploted against the residue position.
#' @return This function returns either one or two plots related to the choosen property along the primary structure, as well as the computed data if sdata has been setted to TRUE.
#' @author Juan Carlos Aledo
#' @examples ptm.plot('P04406', property = 'sasa', window = 10, ptm = 'meto')
#' @examples \dontrun{ptm.ptm('P04406', property = 'dpx', ptm = c('meto', 'p'))}
#' @seealso find.aaindex()
#' @importFrom graphics par
#' @importFrom graphics mtext
#' @importFrom graphics points
#' @export


ptm.plot <- function(up_id, pdb = "", property, ptm , dssp = 'compute',
                     window = 1, sdata = TRUE, ...){

  aa <- aai$aa
  ## ------------------------------------------------------------------- ##
  ## ---------------- Is there any PDB for this protein? --------------- ##
  ## ------------------------------------------------------------------- ##
  if (nchar(pdb) == 6){ # ----- the user provide the PDB and chain
    t <- strsplit(pdb, split = "\\.")[[1]]
    pdb_id <- t[1]
    pdb_chain <- t[2]
    exists.pdb <- TRUE
  } else if (pdb == ""){ # ---- the PDB is selected by the script
    pdb <- pdb.select(up_id)
    if (pdb[[1]][1] == 'NO PDB FOUND'){
      exists.pdb <- FALSE
    } else {
      exists.pdb <- TRUE
      pdb_id <- pdb[[1]][1]
      pdb_chain <- pdb[[2]][1]
      pdb_coverage <- attributes(pdb)$coverage
    }
  } else {
    stop("Wrong pdb input!")
  }


  ## ------------------------------------------------------------------- ##
  ## ------------------- Use of cache when possible -------------------- ##
  ## ------------------------------------------------------------------- ##
  file_scan <- paste("./plotptm_cache/scan_", up_id, ".Rda", sep = "")
  if (file.exists(file_scan)){
    load(file_scan)
  }

  if (exists.pdb){
    file_sse <- paste("./plotptm_cache/sse_", pdb_id, ".Rda", sep = "")
    if (file.exists(file_sse)){
      load(file_sse)
    }
  }


  ## ------------------------------------------------------------------- ##
  ## ------- Checking that a suitable property has been selected ------- ##
  ## ------------------------------------------------------------------- ##
  pdb_property <- c('sasa', 'acc', 'dpx')
  aa_property <- c('volume','mutability', 'helix', 'beta-sheet', 'pi-helix',
                  'hydropathy', 'avg.hyd', 'hplc', 'argos', 'eiip',
                  'polarizability')

  names(aa_property) <- c('FAUJ880103', 'JOND920102', 'KANM800101', 'KANM800102',
                       'FODM020101', 'KYTJ820101', 'CIDH920105', 'MEEJ800101',
                       'ARGP820101', 'VELV850101', 'CHAM820101')

  evo_property <- c('entropy7.aa', 'entropy100.aa','entropy7.condon', 'entropy100.codon')

  all_property <- c(pdb_property, aa_property, evo_property, 'own')

  if (! property %in% all_property){
    stop("A proper property must be indicated")
  }

  if (!exists.pdb & property %in% pdb_property){
    stop('This property cannot be computed because no PDB file could be found')
  }

  if (!is.numeric(window) || window < 1){
    stop("'window' must be numeric and positive")
  }

    if (property == 'own'){
    z <- list(...)
    if (length(z) == 0){
      stop("No aa index has been provided")
    } else {
      index <- z[[1]]
    }

    if (sum(aa == names(index)) != 20){
      stop("The provided aa index must be a named numeric vector")
    }

  }

  ## ------------------------------------------------------------------- ##
  ## ---------- Checking that a suitable ptm has been selected --------- ##
  ## ------------------------------------------------------------------- ##
  supported_ptm <- c('ac', 'me', 'meto', 'p', 'su', 'ub', 'gl', 'sni',
                     'ni', 'reg', 'dis', 'all')
  names(supported_ptm) <- c('green', 'aquamarine', 'red', 'orange', 'deeppink',
                            'darkgreen', 'purple', 'yellow', 'blue',
                            'deepskyblue4', 'darkseagreen3', 'black')

  should_be_empty <- setdiff(ptm, supported_ptm)

  if (length(should_be_empty) != 0){
    stop("The supported PTMs are 'ac', 'me', 'meto', 'p', 'su', 'ub', 'gl', 'sni',
                     'ni', 'reg', 'dis'")
  }

  if (ptm[1] == 'all'){
    ptm <- supported_ptm
  }


  ## ------------------------------------------------------------------- ##
  ## -------------- Scanning the protein for PTM sites ----------------- ##
  ## ------------------------------------------------------------------- ##
  if (! 'scan' %in% ls()){
    scan <- suppressWarnings(ptm.scan(up_id))
    if (sdata){
      dir.create("plotptm_cache", showWarnings = FALSE)
      save(scan, file = paste("./plotptm_cache/scan_", up_id, ".Rda", sep = ""))
    }
  }
  if (! grepl("Sorry", scan)[1]){
    ptmScan <- scan[, c(2,3, which(colnames(scan) %in% ptm))]
    modifications <- !is.na(ptmScan[,3:dim(ptmScan)[2]])
    ptmScan$multi <- apply(as.matrix(modifications), 1, sum)
    ptmScan <- ptmScan[which(ptmScan$multi != 0), ]
    if (nrow(ptmScan) > 0){
      exists.ptm <- TRUE
      for (i in 1:nrow(ptmScan)){
        if (ptmScan$multi[i] == 1){
          ptmScan$col[i] <- names(ptmScan[which(ptmScan[i,-1] == TRUE) + 1][1])
        } else {
          ptmScan$col[i] <- "black"
        }
      }
      for (i in 1:nrow(ptmScan)){
        t <- ptmScan$col[i]
        if (t == "black"){
          color = "black"
        } else {
          color <- names(supported_ptm)[which(supported_ptm == t)]
        }
        ptmScan$col[i] <- color
      }
    } else { # no PTM sites found
      exists.ptm <- FALSE
    }
  } else {
    exists.ptm <- FALSE
  }


  ## ------------------------------------------------------------------- ##
  ## -------- Computing sse and raw 3d properties when required -------- ##
  ## ------------------------------------------------------------------- ##
  if (exists.pdb){
    if (! 'sse' %in% ls()){
      sse <- acc.dssp(pdb_id, dssp = dssp)
      if (sdata){
        dir.create("plotptm_cache", showWarnings = FALSE)
        save(sse, file = paste("./plotptm_cache/sse_", pdb_id, ".Rda", sep = ""))
      }
    }
    sse <- sse[which(sse$chain == pdb_chain), ]
    seq <- sse$aa # aa sequence
    names(seq) <- sse$respdb # position of each aa

    sse$sse <- NA
    for (i in 1:nrow(sse)){
      if (sse$ss[i] %in% c('G', 'H', 'I')){
        sse$sse[i] <- 1 # helixes arecoded as 1
      } else if (sse$ss[i] %in% c('E', 'B')){
        sse$sse[i] <- -1 # strands are coded as -1
      } else {
        sse$sse[i] <- 0 # coils are code as 0
      }
    }
    helices <- sse[which(sse$sse == 1),]
    strands <- sse[which(sse$sse == -1),]
    coils <- sse[which(sse$sse == 0),]

    if (window >= nrow(sse)) {
      stop("'window' must be smaller than the sequence length")
    }

    if (property == 'dpx'){
      dpx <- res.dpx(pdb_id)
    } else if (property == 'hetatom'){
      # -------- TO BE COMPLETED
    } else if (property == 'ncontacts'){
      # -------- TO BE COMPLETED
    }
  } else { # When there is no pdb to be used
    seq <- get.seq(up_id, as.string = FALSE)[[1]] # aa sequence
    names(seq) <- 1:length(seq) # aa position
    if (window >= length(aa)){
      stop("'window' must be smaller than the sequence length")
    }
  }

  ## ------------------------------------------------------------------- ##
  ## ---------- Computing the property values sequence ----------------- ##
  ## ------------------------------------------------------------------- ##
  if (property == 'own'){
    property_seq_mon <- property_seq_com <- index[seq]
  } else if (property %in% aa_property){
    i <- names(aa_property)[which(aa_property == property)]
    property_seq_mon <- property_seq_com <- bio3d::aa.index[[i]]$I[seq]
  } else if (property == "sasa"){
    property_seq_mon <- sse$sasa_chain
    property_seq_com <- sse$sasa_complex
  } else if (property == "acc"){
    property_seq_mon <- sse$acc_chain
    property_seq_com <- sse$acc_complex
  } else if (property == "dpx"){
    dpx <- res.dpx(pdb_id)[which(dpx$chain == pdb_chain), ]
    property_seq_mon <- dpx$min_dpx_chain
    property_seq_com <- dpx$min_dpx_complex
  } else if (property == "hetatom"){
    ## ----------------------------------------- TO BE COMPLETED ----------------- ##
  } else if (property == "ncontacts"){
    ## ----------------------------------------- TO BE COMPLETED ----------------- ##
  } else if (property == "entropy"){
    ## ----------------------------------------- TO BE COMPLETED ----------------- ##
  }

  ## ------------------------------------------------------------------- ##
  ## ------- Smoothing the computed property values sequence ----------- ##
  ## ------------------------------------------------------------------- ##

  smooth <- function(x, window){
    if (window == 1) {
      y <- x
    } else {
      n <- length(x)
      y <- rep(NA, n)
      w <- ceiling(window/2)
    }
    if ((window%%2) == 0) {

      from <- w
      to <- n - w
      y[from:to] <- sapply(from:to, function(i) mean(x[(i - w + 1):(i + w)], na.rm = TRUE))
      if (from - 1 > 0) {
        y[1:(from - 1)] <- sapply(1:(from - 1), function(i) mean(x[1:(i + w)], na.rm = TRUE))
      }
      y[(to + 1):n] <- sapply((to + 1):n, function(i) mean(x[(i - w + 1):n], na.rm = TRUE))

    } else {

      from <- w
      to <- n - (w - 1)
      y[from:to] <- sapply(from:to, function(i) mean(x[(i - w + 1):(i + w - 1)], na.rm = TRUE))

      y[1:(from - 1)] <- sapply(1:(from - 1), function(i) mean(x[1:(i + w - 1)], na.rm = TRUE))

      y[(to + 1):n] <- sapply((to + 1):n, function(i) mean(x[(i - w + 1):n], na.rm = TRUE))

    }
    y <- round(y, 2)
    return(y)
  }

  if (window > 1){
    s_property_seq_mon <- smooth(property_seq_mon, window = window)
    s_property_seq_com <- smooth(property_seq_com, window = window)
  } else {
    s_property_seq_mon <- property_seq_mon
    s_property_seq_com <- property_seq_com
  }


  ## ---------------------------------------------------------------- ##
  ## --------------------- Dependent Variable  ---------------------- ##
  ## ---------------------------------------------------------------- ##

  if (property == 'sasa'){
    y <- s_property_seq_com
    ylab <- expression(paste('SASA (', ring(A)^2, ')', sep = ""))
    dy <- s_property_seq_mon - s_property_seq_com
    dylab <- expression(paste(Delta, 'SASA (', ring(A)^2, ')', sep = ""))
  } else if (property == 'acc'){
    y <- s_property_seq_com
    ylab <- 'Accessibility'
    dy <- s_property_seq_mon - s_property_seq_com
    dylab <- expression(paste(Delta, "Accessibility"))
  } else if (property == 'dpx'){
    y <- s_property_seq_com
    ylab <- expression(paste('Depth (', ring(A), ')', sep = ""))
    dy <- s_property_seq_com - s_property_seq_mon
    dylab <- expression(paste(Delta,'Depth (', ring(A), ')', sep = ""))
  } else if (property == 'hetatm'){
    ## ------- TO BE COMPLETED
    ylab <- expression(paste('Minimal Distance to AS (', ring(A), ')', sep = ""))
  } else if (property == 'ncontacts'){
    ## ------- TO BE COMPLETED
    ylab <-'Total number of contacts'
  } else if (property == 'ncontacts.intra'){
    ## ------- TO BE COMPLETED
    ylab <-'Number of intramolecular contacts'
  } else if (property == 'ncontacts.inter'){
    ## ------- TO BE COMPLETED
    ylab <-'Number of intermolecular contacts'
  } else if (property == 'entropy7.aa'){
    ## ------- TO BE COMPLETED
    ylab <- "Shannon's entropy.aa7"
  } else if (property == 'entropy7.codon'){
    ## ------- TO BE COMPLETED
    ylab <- "Shannon's entropy.codon7"
  } else if (property == 'entropy100.aa'){
    ## ------- TO BE COMPLETED
    ylab <- "Shannon's entropy.aa100"
  } else if (property == 'entropy100.codon'){
    ## ------- TO BE COMPLETED
    ylab <- "Shannon's entropy.codon100"
  } else if (property == 'eiip'){
    y <- s_property_seq_com
    ylab <- "EIIP"
  } else if (property == 'volume'){
    y <- s_property_seq_com
    ylab <- "Volume"
  } else if (property == 'av.hyd'){
    y <- s_property_seq_com
    ylab <- "Averaged Hydrophobicity Index"
  } else if (property == 'pi-helix'){
    y <- s_property_seq_com
    ylab <- "pi-Helix Propensity"
  } else if (property == 'helix'){
    y <- s_property_seq_com
    ylab <- "Alpha-Helix Propensity"
  } else if (property == 'beta-sheet'){
    y <- s_property_seq_com
    ylab <- "Beta-Sheet Propensity"
  } else if (property == 'argos'){
    y <- s_property_seq_com
    ylab <- "Hydrophobicity index"
  } else if (property == 'mutability'){
    y <- s_property_seq_com
    ylab <- "Relative Mutability"
  } else if (property == 'hplc'){
    y <- s_property_seq_com
    ylab <- "Retention time in HPLC"
  } else if (property == "polarizability"){
    y <- s_property_seq_com
    ylab <- "Polarizability"
  } else if (property == 'own'){
    y <- s_property_seq_com
    ylab <- "Own Index"
  }

  ## ---------------------------------------------------------------- ##
  ## ------------------------ Ploting data -------------------------- ##
  ## ---------------------------------------------------------------- ##
  olmfrow <- par('mfrow')
  olmar <- par('mar')
  par(mar = c(2, 4.1, 2, 2.1))
  ## -------- When two plots are shown (chain vs complex) ----------- ##
  if (sum(s_property_seq_com != s_property_seq_mon) != 0){ # Two plots
    # layout(matrix(c(1,2), 2, 1, byrow = TRUE))
    par(mfrow = c(2,1))
    xlab = ""
  } else {
    xlab = "Residue Number"
  }

  ## -------------- Plot 1: Property in the complex ----------------- ##
  x <- as.numeric(names(seq))
  y <- s_property_seq_com
  names(y) <- names(seq)

  ymin <- min(y, na.rm = TRUE)
  ymax <- max(y, na.rm = TRUE)
  ylim <- c(ymin - 0.1*(ymax - ymin), ymax + 0.1*(ymax - ymin))
  i <- which(!is.na(y))
  plot(x[i], na.omit(y), type = 'l', xlab = xlab,
       ylim = ylim, ylab = "")
  mtext(text = ylab, side = 2, line = 2)

  if (exists.ptm){ # Add point corresponding to the PTM sites
    ptmScan <- ptmScan[which(ptmScan$n %in% x), ] # Only modified sites present in the PDB
    if (nrow(ptmScan) >= 1) { # PTM sites may not be found in the PDB structure
      for (i in 1:nrow(ptmScan)){
        t <- ptmScan$n[i]
        points(t, y[which(names(y) == t)], pch = 19,
               cex = 0.55*ptmScan$multi[i], col = ptmScan$col[i])
      }
    }
    if (exists.pdb){
      points(coils$respdb, rep(ymin - 0.07*(ymax - ymin), nrow(coils)),
             pch = 15, col = 'pink', cex = 0.5)
      points(helices$respdb, rep(ymin -0.07*(ymax - ymin), nrow(helices)),
             pch = 15, col = 'cyan', cex = 0.5)
      points(strands$respdb, rep(ymin -0.07*(ymax - ymin), nrow(strands)),
             pch = 15, col = 'magenta', cex = 0.5)
    }
  }

  ## -------------- Plot 2: Property in the monomer ----------------- ##
  if (sum(s_property_seq_mon != s_property_seq_com) != 0){
    # Plot 2: Changes in the property |complex - chain|
    names(dy) <- names(y)
    dymin <- min(y, na.rm = TRUE)
    dymax <- max(y, na.rm = TRUE)
    dylim <- c(ymin - 0.1*(dymax - dymin), ymax + 0.1*(dymax - dymin))
    i <- which(!is.na(dy))
    plot(x[i], dy, type = 'l', xlab = 'Residue Number', ylab = "", ylim = dylim)
    mtext(text = dylab, side = 2, line = 2)

    if (exists.ptm){
      if (nrow(ptmScan) >= 1) {
        for (i in 1:nrow(ptmScan)){
          t <- ptmScan$n[i]
          points(t, dy[which(names(dy) == t)], pch = 19,
                 cex = 0.55*ptmScan$multi[i], col = ptmScan$col[i])
        }
      }
    }
    points(coils$respdb, rep(dymin - 0.07*(dymax - dymin), nrow(coils)),
           pch = 15, col = 'pink', cex = 0.5)
    points(helices$respdb, rep(dymin -0.07*(dymax - dymin), nrow(helices)),
           pch = 15, col = 'cyan', cex = 0.5)
    points(strands$respdb, rep(dymin -0.07*(dymax - dymin), nrow(strands)),
           pch = 15, col = 'magenta', cex = 0.5)
  }


  par(mar = olmar) # restore par to its default value
  par(mfrow = olmfrow)

  output <- "Work done."
  attr(output, "uniprot") <- up_id
  if (exists.pdb){
    attr(output, "pdb") <- pdb_id
    attr(output, "chain") <- pdb_chain
  } else {
    attr(output, "pdb") <- "No pdb found for this protein"
  }

  return(output)
}


## ------------------------------------------------------------------------------- ##
#                               find.aaindex(word)                                    #
## ------------------------------------------------------------------------------- ##
#' Find the Amino Acid Indexes
#' @description Finds amino acid indexes.
#' @usage find.aaindex(word)
#' @param word a character string for the key-word of interest.
#' @return The number and ID of the indexes matching the requested word
#' @author Juan Carlos Aledo
#' @examples find.aaindex('mutability')
#' @examples find.aaindex('Kyte-Doolittle')
#' @seealso ptm.plot()
#' @references https://www.genome.jp/aaindex/AAindex/list_of_indices
#' @export

find.aaindex <- function(word){
  aseveration <- function(x){
    is <- length(grep(word, x$D, ignore.case = TRUE)) != 0
    return(is)
  }
  ind <- which(sapply(bio3d::aa.index, aseveration))
  return(ind)
}
jcaledo/ptm_0.1.1 documentation built on April 4, 2020, 3:48 a.m.