motiftools: Tools to analyze sequence motifs

Documented in align motifAlign

#' Align two sequences
#' 
#' Align two sequences using local (Smith-Waterman) or global (Needleman-Wunsch) sequence alignment.
#' 
#' @param x1 pattern sequence: a character vector, scalar or AAString.
#' @param x2 subject sequence: a character vector, scalar or AAString.
#' @param score.matrix substitution matrix.
#' @param gap.score gap score.
#' @param type type of alignment: global or local (default: local).
#' @param debug logical; whether to return debugging data.
#' 
#' @return AlignResult object.
#' @export
#' 
#' @examples
#' library(motiftools)
#' align("ALVDE", "AVRES", score.matrix = "BLOSUM62")
align <- function(x1, x2, score.matrix = NULL, gap.score = -1, type = "local", debug = FALSE) {
  # process input sequences.
  x1 <- as.character(x1)
  x2 <- as.character(x2)
  
  if (length(x1) == 1 && length(x2) == 1) {
    x1 <- strsplit(x1, "")[[1]]
    x2 <- strsplit(x2, "")[[1]]
  }
  
  # check alignment type.
  type <- match.arg(type, c("global", "local", "global_old", "local_old"))
  
  # get scoring matrix.
  if (is.vector(score.matrix) & typeof(score.matrix) == "character")
    score.matrix <- getScoreMatrix(score.matrix)

  switch(type,
         "local" = {
           sw(x1, x2, score_matrix = score.matrix, gap_score = gap.score, debug = debug)
         },
         "global" = {
           nw(x1, x2, score_matrix = score.matrix, gap_score = gap.score, debug = debug)
         },
         "local_old" = {
           .sw(x1, x2, score.matrix = score.matrix, gap.score = gap.score, debug = debug)
         },
         "global_old" = {
           .nw(x1, x2, score.matrix = score.matrix, gap.score = gap.score, debug = debug)
         })
}

.checkMax <- function(x) x[which.max(x)]

.generateScoreMatrix <- function(x, match.score = 1, mismatch.score = -1) {
  x <- unique(x)
  m <- matrix(mismatch.score, ncol = length(x), nrow = length(x), dimnames = list(x,x))
  diag(m) <- match.score
  m
}

.sw <- function(x1, x2, score.matrix, gap.score = -1,debug = FALSE) {
  l1 <- length(x1)
  l2 <- length(x2)
  
  if(missing(score.matrix))
    score.matrix <- .generateScoreMatrix(c(x1, x2))
  
  # initialize.
  m <- matrix(NA, ncol = l1 + 1, nrow = l2 + 1)
  colnames(m) <- c("", x1)
  rownames(m) <- c("", x2)
  m[1, ] <- 0
  m[, 1] <- 0
  
  # fill
  res <- list()
  res <- c(res, lapply(2:ncol(m), function(j) data.frame(row = 1, col = j, ii = 1, jj = j - 1, type = "h")))
  res <- c(res, lapply(2:nrow(m), function(i) data.frame(row = i, col = 1, ii = i - 1, jj = 1, type = "v")))
  for (i in 2:nrow(m)) {
    for (j in 2:ncol(m)) {
      # diagonal.
      d <- m[i - 1, j - 1] + score.matrix[x1[j - 1], x2[i - 1]]
      # horizonal
      h <- m[i, j - 1] + gap.score
      # vertical
      v <- m[i - 1, j] + gap.score
      # assign.
      tmp <- .checkMax(c(diag = d, hori = h, vert = v))
      if (tmp < 0) tmp[] <- 0 # no negative scores.
      if (tmp > 0) {
        switch(names(tmp),
               diag = {
                 res <- c(res, list(data.frame(row = i, col = j, ii = i - 1, jj = j - 1, type = "d")))
               },
               hori = {
                 res <- c(res, list(data.frame(row = i, col = j, ii = i, jj = j - 1, type = "h")))
               },
               vert = {
                 res <- c(res, list(data.frame(row = i, col = j, ii = i - 1, jj = j, type = "v")))
               }
        )
      }
      m[i, j] <- tmp
    }
  }
  res <- do.call(rbind, res)
  
  # backtrace
  index <- which(m == max(m), arr.ind = TRUE)
  index <- index[nrow(index), ] # there may be several max-- get the last one!
  i <- index[1]
  j <- index[2]
  r1 <- c()
  r2 <- c()
  st <- 0
  repeat {
    if(m[i, j] == 0) break
    
    tmp <- subset(res, row == i & col == j)
    switch(as.character(tmp$type),
           d = {
             r1 <- c(r1, x1[j - 1])
             r2 <- c(r2, x2[i - 1])
             st <- st + m[i, j]
             i <- i - 1
             j <- j - 1
           },
           v = {
             r1 <- c(r1, "-")
             r2 <- c(r2, x2[i - 1])
             i <- i - 1
           },
           h = {
             r1 <- c(r1, x1[j - 1])
             r2 <- c(r2, "-")
             j <- j - 1
           }
    )
  }
  
  # build alignment.
  if (length(r1) > 0) {
    al <- data.frame(rbind(rev(r1), rev(r2)), row.names = c("s1", "s2"), check.names = FALSE)
    colnames(al) <- 1:ncol(al)
  }
  else
    al <- data.frame()
  
  # results.
  if (debug)
    list(score.matrix = score.matrix, gap.score = gap.score, scores = m, traceback = res, sequences = list(s1 = x1, s2 = x2), score = st, alignment = al)
  else
    list(alignment = al, score = st)
}


.nw <- function(x1, x2, score.matrix, gap.score = -1, debug = FALSE) {
  l1 <- length(x1)
  l2 <- length(x2)
  
  if (missing(score.matrix))
    score.matrix <- .generateScoreMatrix(c(x1, x2))
  
  # initialize.
  m <- matrix(NA, ncol = l1 + 1, nrow = l2 + 1)
  colnames(m) <- c("", x1)
  rownames(m) <- c("", x2)
  m[1, ] <- -1 * (0:l1)
  m[, 1] <- -1 * (0:l2)
  
  # fill
  res <- list()
  res <- c(res, lapply(2:ncol(m), function(j) data.frame(row = 1, col = j, ii = 1, jj = j - 1, type = "h")))
  res <- c(res, lapply(2:nrow(m), function(i) data.frame(row = i, col = 1, ii = i - 1, jj = 1, type = "v")))
  for (i in 2:nrow(m)) {
    for (j in 2:ncol(m)) {
      # diagonal.
      d <- m[i - 1, j - 1] + score.matrix[x1[j - 1], x2[i - 1]]
      # horizonal
      h <- m[i, j - 1] + gap.score
      # vertical
      v <- m[i - 1, j] + gap.score
      # assign.
      tmp = .checkMax(c(diag = d, hori = h, vert = v))
      switch(names(tmp),
             diag = {
               res <- c(res, list(data.frame(row = i, col = j, ii = i - 1, jj = j - 1, type = "d")))
             },
             hori = {
               res <- c(res, list(data.frame(row = i, col = j, ii = i, jj = j - 1, type = "h")))
             },
             vert = {
               res <- c(res, list(data.frame(row = i, col = j, ii = i - 1, jj = j, type = "v")))
             }
      )
      m[i, j] <- tmp
    }
  }
  res <- do.call(rbind, res)
  
  # backtrace
  i <- nrow(m)
  j <- ncol(m)
  r1 <- c()
  r2 <- c()
  st <- 0
  repeat {
    tmp <- subset(res, row == i & col == j)
    switch(as.character(tmp$type),
           d = {
             r1 <- c(r1, x1[j-1])
             r2 <- c(r2, x2[i-1])
             st <- st + m[i, j]
             i <- i - 1
             j <- j - 1
           },
           v = {
             r1 <- c(r1, "-")
             r2 <- c(r2, x2[i - 1])
             i <- i - 1
           },
           h = {
             r1 <- c(r1, x1[j - 1])
             r2 <- c(r2, "-")
             j <- j - 1
           }
    )
    if(i == 1 && j == 1) break
  }
  
  # build alignment.
  al <- data.frame(rbind(rev(r1), rev(r2)), row.names = c("s1", "s2"), check.names = FALSE)
  
  # results.
  if (debug)
    list(score.matrix = score.matrix, gap.score = gap.score, scores = m, traceback = res, sequences = list(s1 = x1, s2 = x2), score = st, alignment = al)
  else
    list(alignment = al, score = st)
}

#' Align two motifs represented as PWMs
#' 
#' Takes to motifs represented by PWMs and computes an alignment using some 
#' measure of motif similarity as scoring method. By default this is Pearson 
#' Correlation Coefficient (PCC). Alternatively can be Euclidean distance.
#'
#' @param x1 motif 1.
#' @param x2 motif 2.
#' @param align.method align method, one of "global" or "local" (default = local).
#' @param score.method scoring method, one of "PCC" or "euclid" (default = PCC).
#' @param debug logical; whether to show debugging information.
#'
#' @return AlignResult object.
#' @export
#'
#' @examples
#' NULL
motifAlign <- function(x1, x2, align.method = "local", score.method = "PCC", debug = FALSE) {
  score.method <- match.arg(score.method, c("PCC", "euclid"))
  
  # rename columns for motifs A and B.
  m1 <- x1
  m2 <- x2
  colnames(m1) <- paste0("A", 1:ncol(m1))
  colnames(m2) <- paste0("B", 1:ncol(m2))
  
  # generate motifs column scoring matrix.
  switch(score.method,
         "PCC" = {
           s <- cor(m1, m2)
         },
         "euclid" = {
           m <- cbind(m1, m2)
           s <- as.matrix(dist(t(m), diag = TRUE, upper = TRUE))
         })
  
  # align columns.
  align(colnames(m1), colnames(m2), score.matrix = s, debug = debug, type = align.method)
}

motifDistance <- function(x1, x2, score.method = "PCC", align.method = "local", debug = FALSE) {
  motifAlign(x1, x2, score.method = score.method, align.method = align.method, debug = debug)$score
}