R/dom.R

#' @name domAbundance
#' @title Domain Abundance for each sample
#' @description Compute Pfam-A domain content dissimilarity between two or more
#' proteomes. 
#' @param fastas A \code{character} vector giving the file names of the amino 
#' acid fasta files.
#' @param pfamA \code{character} The path to the Pfam-A.hmm file
#' @param cut Parameter controlling model-specific thresholding. Can be ethier
#' \code{"ga"} or \code{"tc"}. See HMMER 3.1b2 manual for more information.
#' @param distMethod Dissimilarity index, partial match to "manhattan", 
#' "euclidean", "canberra", "bray" (DEFAULT), "kulczynski", "jaccard", "gower", 
#' "altGower", "morisita", "horn", "mountford", "raup" , "binomial", "chao", 
#' "cao" or "mahalanobis". See \link[vegan]{vegdist} for more information.
#' @param cpus \code{integer} The number of cpus to use.
#' @return A \code{list} object. The first element is the abundance matrix and
#' the second is the \code{dist} matrix.
# #' @importFrom vegan vegdist
#' @export
domAbundance <- function(fastas, 
                 pfamA, 
                 cut = 'ga', 
                 # distMethod = 'bray', 
                 cpus = 1L){
  
  #Eval - err
  if (missing(fastas)){
    stop('No fasta files provided.')
  }
  
  if(length(fastas)<2){
    stop('At least 2 fasta files must be provided.')
  }
  
  if (missing(pfamA)){
    stop('Pfam-A.hmm file path is not provided.')
  }
  
  if (Sys.which("hmmsearch")==""){
    stop("\n\tHMMER (v.3) is not installed. (Couldn't find 'hmmsearch' in $PATH)
         \nPlease install it before re-running dcda().\n\n")
  }
  
  #Get pfam-A ids
  cat('Retrieving information from Pfam-A.hmm.. ')
  stats <- hmmStat(hmmfile = pfamA)
  ids <- getIdsFromStats(stats = stats)
  file.remove(stats)
  cat('DONE!\n')
  
  #Press Pfam if not yet.
  idx <- paste0(pfamA, c('.h3f', '.h3i', '.h3m', '.h3p'))
  if (any(!file.exists(idx))){
    cat('Pfam-A.hmm is not indexed. Pressing Pfam-A.hmm.. ')
    hmmPress(model = pfamA)
    cat('DONE!\n')
  }
  
  #Hmmsearch
  cat('Searching.. ')
  mat <- parallel::mclapply(fastas, function(x){
    
    # tmp <- tempfile()
    hmmres <- hmmSearch(fasta = x, 
                        hmm = pfamA, 
                        cut = cut, 
                        oty = 'domtblout', 
                        n_threads = 0)
    dtbl <- readDomtblout(domtblout = hmmres)
    file.remove(hmmres)
    tab <- table(factor(dtbl$PfamID, levels = ids))
    tab
    
  }, mc.preschedule = FALSE, mc.cores = cpus)
  cat(' DONE!\n')
  
  #Bind and reduce
  mat <- do.call(rbind, mat)
  rownames(mat) <- sapply(strsplit(fastas, '/'), function(x){ rev(x)[1] })
  colnames(mat) <- ids
  mat <- mat[, -which(colSums(mat)==0)]
  
  return(mat)
}
iferres/misp_test documentation built on May 5, 2019, 12:32 p.m.