R/predict.R

Defines functions predict.ag_model

Documented in predict.ag_model

#' Predict amyloids
#'
#' Recognizes amyloids using AmyloGram algorithm.
#' @param object \code{ag_model} object.
#' @param newdata \code{list} of sequences (for example as given by
#' \code{\link[seqinr]{read.fasta}}).
#' @param ... further arguments passed to or from other methods.
#' @export
#' @examples
#' data(AmyloGram_model)
#' data(pep424)
#' predict(AmyloGram_model, pep424[c(4, 10)])

predict.ag_model <- function(object, newdata, ...) {
  if(any(!sapply(newdata, is_protein)))
    stop("Atypical aminoacid detected in input data.")
  
  seqs_m <- tolower(t(sapply(newdata, function(i)
    c(i, rep(NA, max(lengths(newdata)) - length(i))))))
  
  gl <- do.call(rbind, lapply(1L:nrow(seqs_m), function(i) {
    res <- do.call(rbind, strsplit(decode_ngrams(seq2ngrams(seqs_m[i, ][!is.na(seqs_m[i, ])], 6, a()[-1])), ""))
    cbind(res, id = paste0("P", rep(i, nrow(res))))
  }))
  
  bitrigrams <- as.matrix(count_multigrams(ns = c(1, rep(2, 4), rep(3, 3)),
                                           ds = list(0, 0, 1, 2, 3, c(0, 0), c(0, 1), c(1, 0)),
                                           seq = degenerate(gl[, -7], object[["enc"]]),
                                           u = as.character(1L:length(object[["enc"]]))))
  
  test_ngrams <- bitrigrams > 0
  storage.mode(test_ngrams) <- "integer"
  
  test_lengths <- lengths(newdata) - 5
  
  
  raw_preds <- predict(object[["rf"]],
                       data.frame(test_ngrams[, object[["imp_features"]],
                                              drop = FALSE]))[["predictions"]]
  preds <- data.frame(
    prob = if(is.matrix(raw_preds)) {
      raw_preds[, 2]
    } else {
      raw_preds[2]
    },
    prot = unlist(lapply(1L:length(test_lengths), function(i) rep(i, test_lengths[i])))
  )
  
  detailed_preds <- do.call(rbind, lapply(unique(preds[["prot"]]), function(single_prot) {
    hexapeptide_preds <- preds[preds[["prot"]] == single_prot, "prob"]
    pos_matrix <- do.call(cbind, get_ngrams_ind(length(hexapeptide_preds) + 5, 6, 0))
    data.frame(
      Protein = names(newdata)[single_prot],
      Aa = toupper(na.omit(seqs_m[single_prot, ])),
      Probability = unlist(lapply(unique(as.vector(pos_matrix)), function(i)
        max(hexapeptide_preds[which(pos_matrix == i, arr.ind = TRUE)[, "row"]])
      )))
  }))
  
  
  res <- list(overview = data.frame(Name = names(newdata),
                             Probability = vapply(unique(preds[["prot"]]), function(single_prot)
                               max(preds[preds[["prot"]] == single_prot, "prob"]),
                               0)),
       detailed = detailed_preds)
  
  class(res) <- "ag_prediction"
  
  res
}
michbur/AmyloGram documentation built on May 25, 2020, 7:13 a.m.