R/desc-15-PSSMAcc.R

Defines functions extractPSSMAcc

Documented in extractPSSMAcc

#' Profile-based protein representation derived by PSSM
#' (Position-Specific Scoring Matrix) and auto cross covariance
#'
#' This function calculates the feature vector based on the PSSM
#' by running PSI-Blast and auto cross covariance tranformation.
#'
#' @param pssmmat The PSSM computed by \code{\link{extractPSSM}}.
#' @param lag The lag parameter. Must be less than the number of amino acids
#' in the sequence (i.e. the number of columns in the PSSM matrix).
#'
#' @return A length \code{lag * 20^2} named numeric vector,
#' the element names are derived by the amino acid name abbreviation
#' (crossed amino acid name abbreviation) and lag index.
#'
#' @seealso \link{extractPSSM} \link{extractPSSMFeature}
#'
#' @author Nan Xiao <\url{https://nanx.me}>
#'
#' @export extractPSSMAcc
#'
#' @references
#' Wold, S., Jonsson, J., Sjorstrom, M., Sandberg,
#' M., & Rannar, S. (1993).
#' DNA and peptide sequences and chemical processes multivariately modelled
#' by principal component analysis and partial least-squares projections
#' to latent structures.
#' \emph{Analytica chimica acta}, 277(2), 239--253.
#'
#' @examples
#' if (Sys.which("makeblastdb") == "" | Sys.which("psiblast") == "") {
#'   cat("Cannot find makeblastdb or psiblast. Please install NCBI Blast+")
#' } else {
#'   x <- readFASTA(system.file(
#'     "protseq/P00750.fasta",
#'     package = "protr"
#'   ))[[1]]
#'   dbpath <- tempfile("tempdb", fileext = ".fasta")
#'   invisible(file.copy(from = system.file(
#'     "protseq/Plasminogen.fasta",
#'     package = "protr"
#'   ), to = dbpath))
#'
#'   pssmmat <- extractPSSM(seq = x, database.path = dbpath)
#'   pssmacc <- extractPSSMAcc(pssmmat, lag = 3)
#'   tail(pssmacc)
#' }
extractPSSMAcc <- function(pssmmat, lag) {

  # Normalize PSSM scores to (0, 1)
  mat <- 1 / (1 + exp(pssmmat))

  accpssm <- function(mat, lag) {
    p <- nrow(mat)
    n <- ncol(mat)

    if (lag > n) {
      stop("lag must be smaller than the amino acids in the sequence")
    }

    # auto covariance: p elements

    acc1 <- matrix(0, nrow = p, ncol = lag)

    for (j in 1:p) {
      for (i in 1:lag) {
        acc1[j, i] <-
          sum(mat[j, 1:(n - i)] * mat[j, ((1:(n - i)) + i)]) / (n - i)
      }
    }

    acc1 <- as.vector(acc1)

    AADict <- c(
      "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
      "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
    )
    names(acc1) <- as.vector(outer(
      AADict, paste0("lag", 1:lag), paste,
      sep = "."
    ))

    # auto cross covariance: p^2 - p elements

    acc2 <- matrix(0, nrow = p^2 - p, ncol = lag)

    idx <- cbind(combn(1:p, 2), combn(1:p, 2)[2:1, ])

    for (j in 1:ncol(idx)) {
      for (i in 1:lag) {
        acc2[j, i] <-
          sum(mat[idx[1, j], 1:(n - i)] * mat[idx[2, j], ((1:(n - i)) + i)]) / (n - i)
      }
    }

    acc2 <- as.vector(acc2)
    names(acc2) <- as.vector(outer(
      paste(AADict[idx[1, ]], AADict[idx[2, ]], sep = "."),
      paste0("lag", 1:lag), paste,
      sep = "."
    ))

    ACC <- c(acc1, acc2)
    ACC
  }

  res <- accpssm(mat, lag)
  res
}

Try the protr package in your browser

Any scripts or data that you put into this service are public.

protr documentation built on Nov. 2, 2023, 6:04 p.m.