R/plot_prot.R

Defines functions plot_prot

Documented in plot_prot

#' Protein structure diagram.
#' 
#' Plots a diagram of protein structure based on several types of annotations and predictions.
#' 
#' @param sequence String representing a protein amino acid sequence.
#' @param id String representing a protein identifier. Will be converted using \code{\link[base]{make.names}}.
#' @param hyp_col Plotting color of predicted hydroxyproline positions. At default set to: '#868686FF'.
#' @param gpi_col Plotting color of the predicted omega site (glycosylphosphatidylinositol attachment). At default set to: '#0073C2FF'.
#' @param nsp_col Plotting color of the N-terminal signal peptide. At default set to: '#CD534CFF'.
#' @param ag_col Plotting color of the AG glycomodul spans. At default set to: '#E5E5E5FF'.
#' @param tm_col Plotting color of the transmembrane regions. At default set to: '#EFC000FF'.
#' @param hyp Boolean, should hydroxyprolines be plotted.
#' @param gpi A string indicating if \code{\link[ragp]{get_big_pi}} (gpi = "bigpi"), \code{\link[ragp]{get_pred_gpi}} (gpi = "predgpi") or \code{\link[ragp]{get_netGPI}} (gpi = "netgpi") should be called when predicting omega sites. To turn off omega site prediction use gpi = "none". At default set to "netgpi". Alternatively the output data frame of the mentioned functions (called with simplify = TRUE) can be supplied.
#' @param nsp A string indicating if \code{\link[ragp]{get_signalp5}} (nsp = "signalp5") or \code{\link[ragp]{get_signalp}} (nsp = "signalp") should be used to obtain N-sp predictions. Alternatively a data frame containing three columns: a character column "id" indicating the protein id as from input, a logical column "is.signalp" and an integer column "sp.length". See \code{\link[ragp]{get_signalp5}} or \code{\link[ragp]{get_signalp}} for details.
#' @param ag Boolean, should the AG glycomodul spans be plotted.
#' @param tm A string indicating if \code{\link[ragp]{get_phobius}} (tm = "phobius") or \code{\link[ragp]{get_tmhmm}} (tm = "tmhmm") should be used to obtain transmembrane region predictions. Alternatively a data frame with two columns:  a character column "id" indicating the protein id as from input and a "prediction" column containing the topology of the transmembrane regions (example "42o81-101i108-126o"). To turn off tm prediction use tm = "none".
#' @param domain A string indicating if \code{\link[ragp]{get_cdd}} (domain = "cdd") or \code{\link[ragp]{get_hmm}} (domain = "hmm") should be used to obtain domain annotation. Alternatively a data frame with five columns: a character column "id" indicating the protein id as from input, a character column "acc" indicating the accession of the domain hit, a character column "desc" indicating the description of the domain hit, a numeric column "align_start" indicating the start of the domain hit, a numeric column "align_end" indicating the end the domain hit. 
#' @param disorder Boolean, should disordered region predictions obtained using \code{\link[ragp]{get_espritz}} be plotted. Alternatively the output data frame from \code{\link[ragp]{get_espritz}} (called with simplify = TRUE) can be supplied.
#' @param dom_sort One of c("ievalue", "abc", "cba"), defaults to "ievalue". Domain plotting order. If 'ievalue' domains with the lowest ievalue as determined by hmmscan will be plotted above. If 'abc' or 'cba' the order is determined by domain Names.
#' @param progress Boolean, whether to show the progress bar, at default set to FALSE.
#' @param gpi_size Integer, the size of the gpi symbol. Appropriate values are 1 - 10.
#' @param gpi_shape Integer, the shape of the gpi symbol. Appropriate values are 0 - 25
#' @param hyp_scan Boolean, if ag = TRUE, should \code{\link[ragp]{scan_ag}} be performed on \code{\link[ragp]{predict_hyp}} output thus scanning only arabinogalactan motifs which contain predicted hydroxyprolines.
#' @param ... Appropriate arguments passed to \code{\link[ragp]{get_signalp5}}, \code{\link[ragp]{get_signalp}}, \code{\link[ragp]{get_espritz}}, \code{\link[ragp]{predict_hyp}}, \code{\link[ragp]{get_hmm}} and \code{\link[ragp]{scan_ag}}.
#'
#'
#' @return A ggplot2 plot object
#' 
#' @seealso \code{\link[ragp]{get_signalp5}} \code{\link[ragp]{get_signalp}} \code{\link[ragp]{get_phobius}} \code{\link[ragp]{get_tmhmm}} \code{\link[ragp]{get_hmm}} \code{\link[ragp]{get_cdd}} \code{\link[ragp]{get_espritz}} \code{\link[ragp]{predict_hyp}} \code{\link[ragp]{scan_ag}}
#'
#' @examples
#' library(ragp)
#' library(ggplot2)
#' ind <- c(23, 5, 80, 81, 345)
#' pred <- plot_prot(sequence = at_nsp$sequence[ind],
#'                   id = at_nsp$Transcript.id[ind])
#' pred +
#'   theme(legend.position = "bottom",
#'         legend.direction = "vertical")
#'         
#' #alternatively:      
#' nsp <- get_signalp(data = at_nsp[ind,],
#'                    id = Transcript.id,
#'                    sequence = sequence)
#'                     
#' hmm <- get_hmm(data = at_nsp[ind,], #default is to use get_cdd()
#'                id = Transcript.id,
#'                sequence = sequence) 
#'                
#' gpi <- get_netGPI(data = at_nsp[ind,],
#'                  id = Transcript.id,
#'                  sequence = sequence)                                    
#'
#' tm <- get_phobius(data = at_nsp[ind,],
#'                   id = Transcript.id,
#'                   sequence = sequence)                                                        
#'  
#' disorder <- get_espritz(data = at_nsp[ind,],	
#'                         id = Transcript.id,	
#'                         sequence = sequence)
#'         
#' pred2 <- plot_prot(sequence = at_nsp$sequence[ind],
#'                    id = at_nsp$Transcript.id[ind],
#'                    tm = tm,
#'                    nsp = nsp,
#'                    gpi = gpi,
#'                    domain = hmm,
#'                    disorder = disorder)
#'                    
#'                    
#' pred2 +
#'   theme(legend.position = "bottom",
#'         legend.direction = "vertical")
#'         
#' #mixing both methods is also a possibility
#'         
#' @export


plot_prot <- function(sequence,
                      id,
                      hyp_col = "#868686FF",
                      gpi_col = "#0073C2FF",
                      nsp_col = "#CD534CFF",
                      ag_col = "#E5E5E5FF",
                      tm_col = "#EFC000FF",
                      hyp = TRUE,
                      gpi = c("bigpi", "predgpi", "netgpi", "none"),
                      nsp = c("signalp", "signalp5", "none"),
                      ag = TRUE,
                      tm = c('phobius', "tmhmm", "none"),
                      domain = c("cdd", "hmm", "none"),
                      disorder = FALSE,
                      hyp_scan = if(ag == TRUE && hyp == TRUE) TRUE else FALSE,
                      dom_sort = c("ievalue", "abc", "cba"),
                      progress = FALSE,
                      gpi_size = 4,
                      gpi_shape = 18,
                      ...) {
  
  if (missing(sequence)){
    stop("protein sequence must be provided to obtain predictions",
         call. = FALSE)
  }
  if (missing(id)){
    stop("protein id must be provided to obtain predictions",
         call. = FALSE)
  }
  if (missing(progress)) {
    progress <- FALSE
  }
  if (length(progress) > 1){
    progress <- FALSE
    warning("progress should be of length 1, setting to default: progress = FALSE",
            call. = FALSE)
  }
  if (!is.logical(progress)){
    progress <- as.logical(progress)
    warning("progress is not logical, converting using 'as.logical'",
            call. = FALSE)
  }
  if (is.na(progress)){
    progress <- FALSE
    warning("progress was set to NA, setting to default: progress = FALSE",
            call. = FALSE)
  }
  id <- as.character(id)
  id_label <- id
  id <- make.names(id)
  sequence <- toupper(as.character(sequence))
  if (length(sequence) != length(id)){
    stop("id and sequence vectors are not of same length",
         call. = FALSE)
  }
  sequence <- sub("\\*$", "", sequence)
  aa_regex <- "[^ARNDCQEGHILKMFPSTWYVarndcqeghilkmfpstwyv]"
  
  if (any(grepl(aa_regex, sequence))){
    warning(paste("sequences: ",
                  paste(id[grepl(aa_regex, sequence)],
                        collapse = ", "), 
                  " contain symbols not corresponding to amino acids",
                  sep = ""),
            call. = FALSE)
  }
  
  if (missing(dom_sort)){
    dom_sort <- "ievalue"
  }
  if (!dom_sort %in% c("ievalue", "abc", "cba")) {
    stop("dom_sort should be one of: 'ievalue', 'abc', 'cba",
         call. = FALSE)
  }
  
  areColors <- function(x) {
    sapply(x, function(X) {
      tryCatch(is.matrix(grDevices::col2rgb(X)), 
               error = function(e) FALSE)
    })
  }
  
  if (length(hyp_col) > 1){
    hyp_col <- "#868686FF"
    warning("One color should be provided for hyp_col. Using default: '#868686FF'",
            call. = FALSE)
  }
  
  if (!areColors(hyp_col)){
    hyp_col <- "#868686FF"
    warning("hyp_col provided is not a valid color, default will be used: '#868686FF'",
            call. = FALSE)
  }
  
  if (length(gpi_col) > 1){
    gpi_col <- "#0073C2FF"
    warning("One color should be provided for gpi_col. Using default: '#0073C2FF'",
            call. = FALSE)
  }
  
  if (!areColors(gpi_col)){
    gpi_col <- "#0073C2FF"
    warning("gpi_col provided is not a valid color, default will be used: '#0073C2FF'",
            call. = FALSE)
  }
  
  if (length(nsp_col) > 1){
    nsp_col <- "#CD534CFF"
    warning("One color should be provided for nsp_col. Using default: '#CD534CFF'",
            call. = FALSE)
  }
  
  if (!areColors(nsp_col)){
    nsp_col <- "#CD534CFF"
    warning("nsp_col provided is not a valid color, default will be used: '#CD534CFF'",
            call. = FALSE)
  }
  
  if (length(ag_col) > 1){
    ag_col <- "#E5E5E5FF"
    warning("One color should be provided for ag_col. Using default: '#E5E5E5FF'",
            call. = FALSE)
  }
  
  if (!areColors(ag_col)){
    ag_col <- "#E5E5E5FF"
    warning("ag_col provided is not a valid color, default will be used: '#E5E5E5FF'",
            call. = FALSE)
  }
  
  if (length(tm_col) > 1){
    tm_col <- "#EFC000FF"
    warning("One color should be provided for tm_col. Using default: '#EFC000FF'",
            call. = FALSE)
  }
  
  if (!areColors(tm_col)){
    tm_col <- "#EFC000FF"
    warning("tm_col provided is not a valid color, default will be used: '#EFC000FF'",
            call. = FALSE)
  }
  
  if(missing(tm)){
    tm <-  "phobius"
  }
  if(is.character(tm)){
    if(!tm %in% c("phobius", "tmhmm", "none")){
      tm <-  "phobius"
      warning(paste("tm should be one of",
                    "'phobius'", "'tmhmm'", "'none';",
                    "setting to default: tm = 'phobius'"),
              call. = FALSE)
    }
    if (length(tm) > 1){
      tm <-  "phobius"
      warning("tm should be of length 1, setting to default: tm = 'phobius'",
              call. = FALSE)
    }
  }
  
  if (missing(disorder)){	
    disorder <- FALSE	
  }	
  
  if (is.logical(disorder)){	
    if (length(disorder) > 1){	
      disorder <- FALSE	
      warning("disorder should be of length 1, setting to default: disorder = FALSE",	
              call. = FALSE)	
    }	
  }
  
  if(missing(nsp)){
    nsp <-  "signalp5"
  }
  if(is.character(nsp)){
    if(!nsp %in% c("signalp5", "signalp", "none")){
      nsp <-  "signalp5"
      warning(paste("nsp should be one of",
                    "'signalp5'", "'signalp'", "'none';",
                    "setting to default: nsp = 'signalp5'"),
              call. = FALSE)
    }
    if (length(nsp) > 1){
      nsp <-  "signalp5"
      warning("nsp should be of length 1, setting to default: nsp = 'signalp5'",
              call. = FALSE)
    }
  }
  
  if (missing(ag)){
    ag <- TRUE
  }
  if (length(ag) > 1){
    ag <- TRUE
    warning("ag should be of length 1, setting to default: ag = TRUE",
            call. = FALSE)
  }
  if (!is.logical(ag)){
    ag <- as.logical(ag)
    warning("ag is not logical, converting using 'as.logical'",
            call. = FALSE)
  }
  if (is.na(ag)){
    ag <- TRUE
    warning("ag was set to NA, setting to default: ag = TRUE",
            call. = FALSE)
  }
  
  if (missing(hyp)){
    hyp <- TRUE
  }
  if (length(hyp) > 1){
    hyp <- TRUE
    warning("hyp should be of length 1, setting to default: hyp = TRUE",
            call. = FALSE)
  }
  if (!is.logical(hyp)){
    hyp <- as.logical(hyp)
    warning("hyp is not logical, converting using 'as.logical'",
            call. = FALSE)
  }
  if (is.na(hyp)){
    hyp <- TRUE
    warning("hyp was set to NA, setting to default: hyp = TRUE",
            call. = FALSE)
  }
  
  if (length(hyp_scan) > 1){
    hyp_scan <- if(ag == TRUE && hyp == TRUE) TRUE else FALSE
    warning("hyp_scan should be of length 1, setting to default: hyp_scan = if(ag == TRUE && hyp == TRUE) TRUE else FALSE",
            call. = FALSE)
  }
  if (!is.logical(hyp_scan)){
    hyp_scan <- as.logical(hyp_scan)
    warning("hyp_scan is not logical, converting using 'as.logical'",
            call. = FALSE)
  }
  if (is.na(hyp_scan)){
    hyp_scan <- if(ag == TRUE && hyp == TRUE) TRUE else FALSE
    warning("hyp_scan was set to NA, setting to default: hyp_scan = if(ag == TRUE && hyp == TRUE) TRUE else FALSE",
            call. = FALSE)
  }
  
  if (!hyp && isTRUE(hyp_scan)) {
    stop("hyp must be TRUE if scan_ag is to use hydroxyproline predictions for motif scan")
  }
  
  if (!ag && isTRUE(hyp_scan)) {
    warning("hyp_scan = TRUE and ag = FALSE, arabinogalactan motif scan will not be performed")
  }
  
  if(missing(domain)){
    domain <-  "cdd"
  }
  if(is.character(domain)){
    if(!domain %in% c("cdd", "hmm", "none")){
      domain <-  "cdd"
      warning(paste("domain should be one of",
                    "'cdd'", "hmm'", "'none';",
                    "setting to default: domain = 'cdd'"),
              call. = FALSE)
    }
    if (length(domain ) > 1){
      domain  <-  "cdd"
      warning("domain should be of length 1, setting to default: domain = 'cdd'",
              call. = FALSE)
    }
  }
  
  if(missing(gpi)){
    gpi <- 'netgpi'
  }
  if(is.character(gpi)){
    if(!gpi %in% c("bigpi", "predgpi", "netgpi", "none")){
      gpi <- 'netgpi'
      warning(paste("gpi should be one of",
                    "'bigpi', 'predgpi', 'netgpi', 'none',",
                    "setting to default: gpi = 'netgpi'"),
              call. = FALSE)
    }
    if (length(gpi) > 1){
      gpi <- 'netgpi'
      warning("gpi should be of length 1, setting to default: gpi = 'netgpi'",
              call. = FALSE)
    }
  }
  
  if(length(gpi_size) != 1){
    gpi_size <- 4
    warning("gpi_size should be of length 1, setting to default: gpi_size = 4",
            call. = FALSE)
  }
  if(!gpi_size %in% 1:10) {
    gpi_size <- 4
    warning("gpi_size can have values from 1 - 10, setting to default: gpi_size = 4",
            call. = FALSE)
  }
  
  if(length(gpi_shape) != 1){
    gpi_shape <- 18
    warning("gpi_shape should be of length 1, setting to default: gpi_shape = 18",
            call. = FALSE)
  }
  if(!gpi_shape %in% 0:25) {
    gpi_shape <- 18
    warning("gpi_shape can have values from 0 - 25, setting to default: gpi_shape = 18",
            call. = FALSE)
  }
  
  args_signalp <- names(formals(get_signalp.character))[2:8]
  args_signalp5 <- names(formals(get_signalp5.character))[2]
  args_scanag <- names(formals(scan_ag.default))[4:7]
  args_espritz <- names(formals(get_espritz.default))[4:5]
  args_hmm <- names(formals(get_hmm.default))[9:10]
  args_cdd <- names(formals(get_cdd.character))[c(2, 5:6)]
  dots <- list(...)
  
  dat <- data.frame(sequence = sequence,
                    id = id,
                    nchar = nchar(sequence),
                    stringsAsFactors = FALSE)
  
  dat$id <- factor(dat$id,
                   levels = unique(dat$id))
  
  dat$id_num <- as.numeric(dat$id)
  
  seq_hmm <- NULL
  if (is.character(domain)){
    if (domain == "hmm"){
      if(progress){
        message("querying hmmscan")
      }
      seq_hmm <- do.call(ragp::get_hmm,
                         c(list(data = dat,
                                sequence = "sequence",
                                id = "id",
                                progress = progress),
                           dots[names(dots) %in% args_hmm]))
      seq_hmm <- seq_hmm[seq_hmm$reported,]
      if(nrow(seq_hmm) != 0){
        seq_hmm <- seq_hmm[!is.na(seq_hmm$align_start),]
        
        seq_hmm$id <- factor(seq_hmm$id,
                             levels = unique(dat$id))
        
        seq_hmm$id_num <- as.numeric(seq_hmm$id)
        
        seq_hmm$domain <- with(seq_hmm,
                               paste(desc,
                                     " (",
                                     acc,
                                     ") ",
                                     sep = ""))
        if (dom_sort == "ievalue"){
          seq_hmm <- seq_hmm[with(seq_hmm, order(id_num,
                                                 as.numeric(ievalue),
                                                 decreasing = c(FALSE, TRUE),
                                                 method = "radix")),]
        }
        
        if (dom_sort == "abc"){
          seq_hmm <- seq_hmm[with(seq_hmm, order(id_num, domain)),]
        }
        if (dom_sort == "cba"){
          seq_hmm <- seq_hmm[with(seq_hmm, order(id_num,
                                                 domain,
                                                 decreasing = c(FALSE, TRUE),
                                                 method = "radix")),]
        }
      } else {
        seq_hmm <- NULL
      }
    }
    
    if (domain == "cdd"){
      if(progress){
        message("querying cdd")
      }
      seq_hmm <- do.call(ragp::get_cdd,
                         c(list(data = dat,
                                sequence = "sequence",
                                id = "id",
                                progress = progress),
                           dots[names(dots) %in% args_cdd]))
      if(nrow(seq_hmm) != 0){
        seq_hmm <- seq_hmm[!is.na(seq_hmm$align_start),]
        
        seq_hmm$id <- factor(seq_hmm$id,
                             levels = unique(dat$id))
        
        seq_hmm$id_num <- as.numeric(seq_hmm$id)
        
        seq_hmm$domain <- with(seq_hmm,
                               paste(desc,
                                     " (",
                                     acc,
                                     ") ",
                                     sep = ""))
        if (dom_sort == "ievalue"){
          seq_hmm <- seq_hmm[with(seq_hmm, order(id_num,
                                                 as.numeric(evalue),
                                                 decreasing = c(FALSE, TRUE),
                                                 method = "radix")),]
        }
        
        if (dom_sort == "abc"){
          seq_hmm <- seq_hmm[with(seq_hmm, order(id_num, domain)),]
        }
        if (dom_sort == "cba"){
          seq_hmm <- seq_hmm[with(seq_hmm, order(id_num,
                                                 domain,
                                                 decreasing = c(FALSE, TRUE),
                                                 method = "radix")),]
        }
      }
    } 
  }
  
  
  if(is.data.frame(domain)){
    seq_hmm <- domain
    if(any(!c("id",
              "acc",
              "desc",
              "align_start",
              "align_end") %in% colnames(seq_hmm))){
      stop(paste("domain data frame should have columns 'id', 'acc', 'desc'",
                 "'align_start', 'align_end'"))
    }
    seq_hmm$id <- make.names(seq_hmm$id)
    if(!all(seq_hmm$id %in% id)){
      stop("protein ids from domain do not match with id argument")
    }
    if(nrow(seq_hmm) != 0){
      seq_hmm <- seq_hmm[!is.na(seq_hmm$align_start),]
      
      seq_hmm$id <- factor(seq_hmm$id,
                           levels = unique(dat$id))
      
      seq_hmm$id_num <- as.numeric(seq_hmm$id)
      
      seq_hmm$domain <- with(seq_hmm,
                             paste(desc,
                                   " (",
                                   acc,
                                   ") ",
                                   sep = ""))

      if (dom_sort == "abc"){
        seq_hmm <- seq_hmm[with(seq_hmm, order(id_num, domain)),]
      }
      if (dom_sort == "cba"){
        seq_hmm <- seq_hmm[with(seq_hmm, order(id_num,
                                               domain,
                                               decreasing = c(FALSE, TRUE),
                                               method = "radix")),]
      }
    } else {
      seq_hmm <- NULL
    }
  }
  
  seq_signalp <- NULL
  if (is.character(nsp)){
    if (nsp == "signalp5") {
      if(progress){
        message("querying signalp5")
      }
      seq_signalp <- do.call(ragp::get_signalp5,
                             c(list(data = dat,
                                    sequence = "sequence",
                                    id = "id",
                                    progress = progress),
                               dots[names(dots) %in% args_signalp5]))
      
      seq_signalp <- seq_signalp[seq_signalp$is.signalp,]
      
      if (nrow(seq_signalp) != 0){
        seq_signalp$sp.length <- as.numeric(
          as.character(
            seq_signalp$sp.length
          )
        )
      } else {
        seq_signalp <- NULL
      }
    }
    
    if (nsp == "signalp") {
      if(progress){
        message("querying signalp")
      }
      seq_signalp <- do.call(ragp::get_signalp,
                             c(list(data = dat,
                                    sequence = "sequence",
                                    id = "id",
                                    progress = progress),
                               dots[names(dots) %in% args_signalp]))
      
      seq_signalp <- seq_signalp[seq_signalp$is.signalp,]
      
      if (nrow(seq_signalp) != 0){
        seq_signalp$sp.length <- as.numeric(
          as.character(
            seq_signalp$sp.length
          )
        )
      } else {
        seq_signalp <- NULL
      }
    }
  }
  
  

  if(is.data.frame(nsp)){
    seq_signalp <- nsp
    if(any(!c("id", "is.signalp", "sp.length") %in% colnames(seq_signalp))){
      stop(paste("nsp does not contain columns", "'id'", "'is.signalp'",  "and", "'sp.length'"))
    }
    seq_signalp$id <- make.names(seq_signalp$id)
    if(!all(seq_signalp$id %in% id)){
      stop("protein ids from nsp do not match with id argument")
    }
    if(!is.logical(seq_signalp$is.signalp)){
      stop("is.signalp column is not logical")
    }
    if(!is.numeric(seq_signalp$sp.length)){
      stop("sp.length column is not numeric")
    }
    seq_signalp <- seq_signalp[seq_signalp$is.signalp,]
    if(nrow(seq_signalp) != 0){
      seq_signalp$sp.length <- as.numeric(
        as.character(
          seq_signalp$sp.length
        )
      )
    } else {
      seq_signalp <- NULL
    }
  }
  
  phobius_seq <- NULL
  if (is.character(tm)){
    if (tm == "phobius") {
      if(progress){
        message("querying phobius")
      }
      phobius_seq <- ragp::get_phobius(data = dat,
                                       sequence = sequence,
                                       id = id,
                                       progress = progress)
    }
    
    if (tm == "tmhmm") {
      if(progress){
        message("querying tmhmm")
      }
      phobius_seq <- ragp::get_tmhmm(data = dat,
                                     sequence = sequence,
                                     id = id,
                                     progress = progress)
    }
  }
  
  
  
  if(is.data.frame(tm)){
    phobius_seq <- tm
    if(any(!c("id",
              "prediction") %in% colnames(phobius_seq))){
      stop("tm does not contain the columns 'id' and 'prediction'")
    }
    phobius_seq$id <- make.names(phobius_seq$id)
    if(!all(phobius_seq$id %in% id)){
      stop("protein ids from tm do not match with id argument")
    }
  }
  
  if (!is.null(phobius_seq)){
    phobius_seq <- merge(dat,
                         phobius_seq,
                         by.x = "id",
                         by.y = "id",
                         sort = FALSE)
    
    phobius_seq_pred <- unlist(
      lapply(
        strsplit(
          phobius_seq$prediction,
          "/"),
        function(x){
          if (length(x) == 2) {
            x[2]
          } else {
            x[1]
          }
        }
      )
    )
    phobius_seq <- data.frame(id = phobius_seq$id, 
                              id_num = phobius_seq$id_num,
                              pred = phobius_seq_pred,
                              is.tm = grepl("\\d+-\\d+",
                                            phobius_seq_pred),
                              seq = phobius_seq$sequence,
                              stringsAsFactors = FALSE)
    
    phobius_seq <- phobius_seq[phobius_seq$is.tm,]
    
    if (nrow(phobius_seq) == 0){
      phobius_seq <- NULL
    } else {
      tm <- stringr::str_extract_all(phobius_seq$pred, 
                                     "\\d+-\\d+")
      names(tm) <- phobius_seq$id
      tm <- lapply(tm, cbind)
      id_tm <- rep(phobius_seq$id_num, sapply(tm, nrow))
      tm <- do.call(rbind, tm)
      tm <- data.frame(tm, id_tm)
      tm$tm_start <- as.numeric(stringr::str_extract(tm$tm, 
                                                     "\\d+(?=-)"))
      tm$tm_end <- as.numeric(stringr::str_extract(tm$tm, 
                                                   "(?<=-)\\d+"))
      out <- stringr::str_extract_all(phobius_seq$pred, 
                                      "\\d+o\\d+|\\d+o|o\\d+")
      out <- lapply(out, cbind)
      id_out <- rep(phobius_seq$id_num, sapply(out, nrow))
      seq_out <- nchar(rep(phobius_seq$seq, sapply(out, 
                                                   nrow)))
      out <- do.call(rbind, out)
      out <- data.frame(out, id_out)
      out$out_start <- as.numeric(stringr::str_extract(out$out, 
                                                       "\\d+(?=o)"))
      out$out_start <- ifelse(is.na(out$out_start), 1, 
                              out$out_start)
      out$out_end <- as.numeric(stringr::str_extract(out$out, 
                                                     "(?<=o)\\d+"))
      out$out_end <- ifelse(is.na(out$out_end), seq_out, 
                            out$out_end)
      inside <- stringr::str_extract_all(phobius_seq$pred, 
                                         "\\d+i\\d+|\\d+i|i\\d+")
      inside <- lapply(inside, cbind)
      id_inside <- rep(phobius_seq$id_num, sapply(inside, 
                                                  nrow))
      seq_inside <- nchar(rep(phobius_seq$seq, sapply(inside, 
                                                      nrow)))
      inside <- do.call(rbind, inside)
      inside <- data.frame(inside, id_inside)
      inside$inside_start <- as.numeric(stringr::str_extract(inside$inside, 
                                                             "\\d+(?=i)"))
      inside$inside_start <- ifelse(is.na(inside$inside_start), 
                                    1, inside$inside_start)
      
      inside$inside_end <- as.numeric(stringr::str_extract(inside$inside, 
                                                           "(?<=i)\\d+"))
      inside$inside_end <- ifelse(is.na(inside$inside_end), 
                                  seq_inside,
                                  inside$inside_end)
    }
  }
  
  seq_gpi <- NULL
  if(is.character(gpi)){
    if (gpi == 'bigpi') {
      if(progress){
        message("querying big pi")
      }
      seq_gpi <- ragp::get_big_pi(data = dat,
                                  sequence = "sequence",
                                  id = "id",
                                  progress = progress)
      
      seq_gpi$omega_site <- as.numeric(seq_gpi$omega_site)
      seq_gpi <- seq_gpi[seq_gpi$is.gpi,]
      if (nrow(seq_gpi) == 0) {
        seq_gpi <- NULL
      }
    }
    
    if (gpi == 'predgpi') {
      if(progress){
        message("querying predGPI")
      }
      seq_gpi <- ragp::get_pred_gpi(dat,
                                    sequence = "sequence",
                                    id = "id",
                                    progress = progress)
      seq_gpi <- seq_gpi[seq_gpi$is.gpi,]
      
      if (nrow(seq_gpi) == 0) {
        seq_gpi <- NULL
      }
    }
    
    if (gpi == 'netgpi') {
      if(progress){
        message("querying NetGPI")
      }
      seq_gpi <- ragp::get_netGPI(dat,
                                  sequence = "sequence",
                                  id = "id",
                                  progress = progress)
      seq_gpi <- seq_gpi[seq_gpi$is.gpi,]
      
      if (nrow(seq_gpi) == 0) {
        seq_gpi <- NULL
      }
    }
  }
  
  if(is.data.frame(gpi)){
    seq_gpi <- gpi
    if(any(!c("id",
              "omega_site",
              "is.gpi") %in% colnames(seq_gpi))){
      stop("gpi is not the output from get_big_pi, get_pred_gpi or get_netGPI functions")
    }
    seq_gpi$id <- make.names(seq_gpi$id)
    if(!all(seq_gpi$id %in% id)){
      stop("protein ids from gpi do not match with id argument")
    }
    if(!is.logical(seq_gpi$is.gpi)){
      stop("is.gpi column is not logical")
    }
    seq_gpi <- seq_gpi[seq_gpi$is.gpi,]
    
    if (nrow(seq_gpi) == 0) {
      seq_gpi <- NULL
    }
  } 
  
  
  if (hyp) {
    seq_hyp <- do.call(ragp::predict_hyp,
                       c(list(data = dat,
                              sequence = "sequence",
                              id = "id"),
                         dots[names(dots) == "tprob"],
                         dots[names(dots) == "version"])
    )
    
    if (hyp_scan) hyp_ag_scan <- seq_hyp$sequence
    
    seq_hyp <- seq_hyp$prediction
    
    if (!is.null(seq_hyp)) {
      seq_hyp <- seq_hyp[seq_hyp$HYP == "Yes",]
    }
    
    if (!is.null(seq_hyp) && nrow(seq_hyp) == 0) {
      seq_hyp <- NULL
    }
  } else {
    seq_hyp <- NULL
  }
  
  if (ag && hyp_scan) {
    seq_scan <- do.call(ragp::scan_ag,
                        c(list(data = hyp_ag_scan,
                               sequence = "sequence",
                               id = "id",
                               simplify = FALSE,
                               tidy = TRUE),
                          dots[names(dots) %in% args_scanag]))
    
    seq_scan <- seq_scan[, -5]
    seq_scan <- seq_scan[!is.na(seq_scan$location.start),]
    
    if (nrow(seq_scan) == 0) {
      seq_scan <- NULL
    }
  } else if (ag && !hyp_scan) {
    seq_scan <- do.call(ragp::scan_ag,
                        c(list(data = dat,
                               sequence = "sequence",
                               id = "id",
                               simplify = FALSE,
                               tidy = TRUE),
                          dots[names(dots) %in% args_scanag]))
    
    seq_scan <- seq_scan[, -5]
    seq_scan <- seq_scan[!is.na(seq_scan$location.start),]
    
    if (nrow(seq_scan) == 0) {
      seq_scan <- NULL
    }
  } else {
    seq_scan <- NULL
  }
  
  seq_espritz <- NULL	
  if (isTRUE(disorder)) {	
    if(progress){	
      message("querying espritz")	
    }	
    seq_espritz <- do.call(get_espritz,	
                           c(list(data = dat,	
                                  sequence = "sequence",	
                                  id = "id",	
                                  progress = progress),	
                             dots[names(dots) %in% args_espritz]))	
    
    seq_espritz$id <- factor(seq_espritz$id,	
                             levels = unique(dat$id))	
    
    seq_espritz$id_num <- as.numeric(seq_espritz$id)	
    seq_espritz$start[is.na(seq_espritz$start)] <- 0	
    seq_espritz$end[is.na(seq_espritz$end)] <- 0	
    
    seq_espritz <- merge(seq_espritz,	
                         dat[,c(2,3)],	
                         by.x = "id",	
                         by.y = "id",	
                         all.y = TRUE,	
                         all.x = TRUE,	
                         sort = FALSE) 	
    
    seq_espritz <- split(seq_espritz ,	
                         seq_espritz$id)	
    
    seq_espritz <- lapply(seq_espritz, function(x){	
      start <-  x$start	
      end <- x$end	
      id <- unique(x$id)	
      id_num <- unique(x$id_num)	
      resD <- lapply(seq_along(start), function(i){	
        start[i]:end[i]	
      })	
      groups <- rep(seq_along(resD), times = lengths(resD))	
      data.frame(residue = unlist(resD),	
                 groups = groups,	
                 id = id,	
                 id_num = id_num,	
                 nchar = unique(x$nchar))	
    })	
    
    rle_predO <- lapply(seq_espritz, function(x){	
      n <- unique(x$nchar)	
      n <- 1:n	
      n <- setdiff(n, x$residue)	
      if(length(n) != 0){	
        s <- split(n, cumsum(c(0, diff(n) != 1)))	
        groups <- rep(seq_along(s), times = lengths(s))	
      } else {	
        n <- NA	
        groups <- 1	
      }	
      rle_dat <- data.frame(residue = n,	
                            id = unique(x$id),	
                            id_num = unique(x$id_num),	
                            group = groups)	
      
      rle_dat <- split(rle_dat, rle_dat$group)	
      rle_dat <- lapply(rle_dat, function(x){	
        data.frame(start = min(x$residue),	
                   end = max(x$residue),	
                   id = unique(x$id),	
                   id_num = unique(x$id_num))	
      })	
      rle_dat <- do.call(rbind, rle_dat)	
      rle_dat	
    })	
    
    rle_predO <- do.call(rbind, rle_predO)	
    rownames(rle_predO) <- NULL	
    
    plot_segments <- rle_predO[!is.na(rle_predO$start),]	
    
    seq_espritz <- do.call(rbind, seq_espritz) 	
    
    seq_espritz <- seq_espritz[seq_espritz$residue != 0,]	
    if (nrow(seq_espritz) != 0){	
      rownames(seq_espritz) <- NULL	
      seq_espritz$y <- rep(c(0, -0.1, 0, 0.1),	
                           length.out = nrow(seq_espritz))	
      seq_espritz$y_plot <- seq_espritz$y + seq_espritz$id_num	
      seq_espritz$inter <- interaction(seq_espritz$group,	
                                       seq_espritz$id_num)
    } else {	
      seq_espritz <- NULL	
    }	
  }	
  
  if(is.data.frame(disorder)){	
    seq_espritz <- disorder	
    if(any(!c("start",	
              "end",	
              "id") %in% colnames(seq_espritz))){	
      stop("disorder is not the output from get_espritz function")	
    }	
    seq_espritz$id <- make.names(seq_espritz$id)	
    if(!all(seq_espritz$id %in% id)){	
      stop("protein ids from disorder do not match with id argument")	
    }	
    seq_espritz$id <- factor(seq_espritz$id,	
                             levels = unique(dat$id))	
    seq_espritz$id_num <- as.numeric(seq_espritz$id)	
    seq_espritz$start[is.na(seq_espritz$start)] <- 0	
    seq_espritz$end[is.na(seq_espritz$end)] <- 0	
    
    seq_espritz <- merge(seq_espritz,	
                         dat[,c(2,3)],	
                         by.x = "id",	
                         by.y = "id",	
                         all.y = TRUE,	
                         all.x = TRUE,	
                         sort = FALSE) 	
    
    seq_espritz <- split(seq_espritz ,	
                         seq_espritz$id)	
    
    seq_espritz <- lapply(seq_espritz, function(x){	
      start <-  x$start	
      end <- x$end	
      id <- unique(x$id)	
      id_num <- unique(x$id_num)	
      resD <- lapply(seq_along(start), function(i){	
        start[i]:end[i]	
      })	
      groups <- rep(seq_along(resD), times = lengths(resD))	
      data.frame(residue = unlist(resD),	
                 groups = groups,	
                 id = id,	
                 id_num = id_num,	
                 nchar = unique(x$nchar))	
    })	
    
    rle_predO <- lapply(seq_espritz, function(x){	
      n <- unique(x$nchar)	
      n <- 1:n	
      n <- setdiff(n, x$residue)	
      if(length(n) != 0){	
        s <- split(n, cumsum(c(0, diff(n) != 1)))	
        groups <- rep(seq_along(s), times = lengths(s))	
      } else {	
        n <- NA	
        groups <- 1	
      }	
      rle_dat <- data.frame(residue = n,	
                            id = unique(x$id),	
                            id_num = unique(x$id_num),	
                            group = groups)	
      
      rle_dat <- split(rle_dat, rle_dat$group)	
      rle_dat <- lapply(rle_dat, function(x){	
        data.frame(start = min(x$residue),	
                   end = max(x$residue),	
                   id = unique(x$id),	
                   id_num = unique(x$id_num))	
      })	
      rle_dat <- do.call(rbind, rle_dat)	
      rle_dat	
    })	
    
    rle_predO <- do.call(rbind, rle_predO)	
    rownames(rle_predO) <- NULL	
    
    plot_segments <- rle_predO[!is.na(rle_predO$start),]	
    
    seq_espritz <- do.call(rbind, seq_espritz) 	
    
    seq_espritz <- seq_espritz[seq_espritz$residue != 0,]	
    if (nrow(seq_espritz) != 0){	
      rownames(seq_espritz) <- NULL	
      seq_espritz$y <- rep(c(0, -0.1, 0, 0.1),	
                           length.out = nrow(seq_espritz))	
      seq_espritz$y_plot <- seq_espritz$y + seq_espritz$id_num	
      seq_espritz$inter <- interaction(seq_espritz$group,	
                                       seq_espritz$id_num)
    } else {	
      seq_espritz <- NULL	
    }	
  }
  
  if (!is.null(seq_gpi)){
    for_plot <- merge(x = dat,
                      y = seq_gpi,
                      by = "id",
                      all.x = TRUE,
                      all.y = FALSE,
                      sort = FALSE)
  } else {
    for_plot <- dat
  }
  
  if (!is.null(seq_signalp)){
    for_plot <- merge(for_plot,
                      seq_signalp,
                      by = "id",
                      all.x = TRUE,
                      all.y = FALSE,
                      sort = FALSE)
  } else {
    for_plot <- for_plot
  }
  
  if (!is.null(dim(seq_hyp))){
    for_plot <- merge(for_plot,
                      seq_hyp,
                      by = "id",
                      all.x = TRUE,
                      all.y = FALSE,
                      sort = FALSE)
  } else {
    for_plot <- for_plot 
  }
  
  if (!is.null(dim(seq_scan))){
    for_plot <- merge(for_plot,
                      seq_scan,
                      by = "id",
                      all.x = TRUE,
                      all.y = FALSE,
                      sort = FALSE)
  } else {
    for_plot <- for_plot
  }
  
  lims <- c("N-sp", "omega site (gpi)", "hyp", "AG span", "TM")
  vals <- c(nsp_col, gpi_col, hyp_col, ag_col, tm_col)
  labs <- c("N-sp", "omega site (gpi)", "hyp", "AG span", "TM")
  
  subs <- c(!is.null(seq_signalp),
            !is.null(seq_gpi),
            !is.null(seq_hyp),
            !is.null(seq_scan),
            !is.null(phobius_seq))
  
  if (any(subs)){
    lims <- lims[subs]
    vals <- vals[subs]
    labs <- labs[subs]
  } else {
    lims <- NA
    vals <- NA
    labs <- NULL
  }
  rat <- max(nchar(sequence))
  rat <- round(rat/100, 0)*100
  rat <- rat/20
  
  if(!is.null(seq_espritz)){
    p <- ggplot2::ggplot(plot_segments)+
      ggplot2::geom_segment(ggplot2::aes_(y = ~id_num,
                                          yend = ~id_num,
                                          x = ~start,
                                          xend = ~end))
  } else {
    p <- ggplot2::ggplot(for_plot)+
      ggplot2::geom_segment(ggplot2::aes_(y = ~id_num,
                                          yend = ~id_num,
                                          x = 1,
                                          xend = ~nchar))
  }

  p <- p  +
    ggplot2::xlab("residue") +
    ggplot2::ylab("id") +
    ggplot2::scale_y_continuous(breaks = seq_along(id),
                                labels = id_label,
                                limits = c(0.5, length(id) + 0.5)) +
    ggplot2::theme_bw()+
    ggplot2::theme(panel.grid = ggplot2::element_blank(),
                   panel.border = ggplot2::element_blank(),
                   axis.ticks.y = ggplot2::element_blank(),
                   axis.line.x = ggplot2::element_line()) +
    ggplot2::coord_equal(ratio = rat,
                         expand = FALSE)
  
  if(!is.null(phobius_seq)){
    p <- p +
      ggplot2::geom_rect(data = tm,
                         ggplot2::aes_(ymin = ~id_tm - 0.35,
                                       ymax = ~id_tm + 0.35,
                                       xmin = ~tm_start,
                                       xmax = ~tm_end, color = "TM"),
                         fill = tm_col,
                         na.rm = TRUE) +
      ggplot2::geom_segment(data = inside,
                            ggplot2::aes_(y = ~id_inside - 0.35,
                                          yend = ~id_inside - 0.35,
                                          x = ~inside_start,
                                          xend = ~inside_end),
                            lty = 2,
                            na.rm = TRUE)+
      ggplot2::geom_segment(data = out,
                            ggplot2::aes_(y = ~id_out + 0.35,
                                          yend = ~id_out + 0.35,
                                          x = ~out_start,
                                          xend = ~out_end),
                            lty = 2,
                            na.rm = TRUE)
  }
  
  if(!is.null(seq_hmm)) {
    p <- p + 
      ggplot2::geom_rect(data = seq_hmm,
                         ggplot2::aes_(ymin = ~id_num - 0.25,
                                       ymax = ~id_num + 0.25,
                                       xmin = ~align_start,
                                       xmax = ~align_end,
                                       fill = ~domain),
                         colour = "grey20",
                         na.rm = TRUE)
  }
  
  
  if(!is.null(seq_signalp)) {
    p <- p +
      ggplot2::geom_segment(data = for_plot,
                            ggplot2::aes_(y = ~id_num,
                                          yend = ~id_num,
                                          x = 1,
                                          xend = ~sp.length,
                                          color = "N-sp"),
                            size = 2,
                            na.rm = TRUE)
  } 
  
  if(!is.null(seq_scan)) {
    p <- p +
      ggplot2::geom_rect(data = for_plot,
                         ggplot2::aes_(ymin = ~id_num - 0.18,
                                       ymax = ~id_num + 0.18,
                                       xmin = ~location.start,
                                       xmax = ~location.end,
                                       color = "AG span"),
                         fill = ag_col,
                         na.rm = TRUE) 
  } 
  
  if(!is.null(seq_hyp)) {
    p <- p +
      ggplot2::geom_errorbar(data = for_plot,
                             ggplot2::aes_(ymin = ~id_num - 0.18,
                                           ymax = ~id_num + 0.18,
                                           x  = ~P_pos,
                                           color = "hyp"),
                             size = 0.2,
                             width = 0,
                             na.rm = TRUE)
  }
  
  if(!is.null(seq_espritz)){	
    p <- p + ggplot2::geom_path(data = seq_espritz,	
                                ggplot2::aes_(x = ~residue,	
                                              y = ~y_plot,	
                                              group = ~inter),	
                                size = 0.7,	
                                color = "grey25") 	
  }

  if(!is.null(seq_gpi)) {
    p <- p +
      ggplot2::geom_point(data = for_plot,
                          ggplot2::aes_(y = ~id_num,
                                        x = ~omega_site,
                                        color = "omega site (gpi)"),
                          shape = gpi_shape,
                          size = gpi_size,
                          na.rm = TRUE) 
    
  } 
  
  if (any(subs)){
    p <- p +
      ggplot2::scale_color_manual("feature",
                                  limits = lims,
                                  values = vals,
                                  labels = labs)
    p <- p + 
      ggplot2::guides(color = ggplot2::guide_legend(keywidth = 1,
                                                    keyheight = 1,
                                                    override.aes = list(size = 5,
                                                                        shape = 15,
                                                                        linetype = 1)))
  }
  return(p)
}
missuse/ragp documentation built on Jan. 4, 2022, 10:49 a.m.