#' @name doMLST
#' @title Perform MLST Analysis Over a List of Genomes
#' @description Takes a list of genome fasta files and perform blastn searches
#' to identify the sequence type for each of the genes/loci available in a mlst
#' scheme.
#' @param infiles A vector of genome sequence file paths in fasta format. Could
#' also be multifasta files of predicted Open Reading Frames.
#' @param org A valid organism from pubmlst.org. Run \link{listPubmlst_orgs} to
#' see available ones.
#' @param scheme \code{integer}. The scheme id number for a given organism. Run
#' \link{listPubmlst_schemes} to see available schemes for certain \code{org}.
#' @param schemeFastas A vector with the path to fasta sequences from each
#' loci of the specified mlst scheme. If it is \code{NULL} (default), sequences
#' are downloaded from \url{http://rest.pubmlst.org} to \code{dir}.
#' @param schemeProfile The path to the profile file (.tab). If left \code{NULL}
#' then it is downloaded from \url{http://rest.pubmlst.org} to \code{dir}.
#' @param write \code{character}. One of \code{"new"} (Default), \code{"all"} or
#' \code{"none"}. The fist one writes only new alleles found (not reported in
#' pubmlst.org), the second writes all alleles found, and "none" do not write
#' any file.
#' @param ddir A non-existing directory where to download the loci fasta files
#' in case they are not provided by the user. Default:
#' \code{paste0('pubmlst','_',org,'_',scheme)}
#' @param fdir A non-existing directory where to write fasta files of found
#' sequences (see \code{write}). (Default:
#' \code{paste0('alleles','_',org,'_',scheme)}).
#' @param n_threads \code{integer}. The number of cores to use. Each job consist
#' on the process for one genome. Blastn searches will use 1 core per job.
#' @param pid Percentage identity threshold to be consider as a novel allele. An
#' \code{integer} <= 100. (Default: 90).
#' @param scov Subject coverage threshold to be consider as a novel allele. A
#' \code{numeric} between 0 and 1. Not recomended to set it below 0.7 .
#' (Default 1.0)
#'# @details
#' @return An object of class "mlst", which consists on a list of 2 dataframes.
#' The first one is the alleles called for the \code{infiles} and the ST
#' detected, whereas the second is the scheme profile. The "result" data.frame
#' shows one genome per row and one gene from the selected scheme per column.
#' The last column is the ST detected for each genome. "NA" values means that
#' no allele were found in the respective genome. A 'u' (from 'unknown') plus
#' an integer means that the allele found was not reported in pubmlst.org
#' database; a fasta file with the new allele is written in this case if
#' \code{"write"} is set to "new" or "all" (see above). New alleles are
#' compared with each other so the names are properly chosen in case 2 or more
#' of those novel alleles were the same (in that cases, the names would be the
#' same). A number indicates the allele id number of the reported alleles in
#' pubmlst.org . The second data.frame is the profile definition chosen, the
#' logic is the same as the first one but in this case rows are the combination
#' of alleles reported at pubmlst.org . A series of attributes are also given
#' in the "mlst" object, mainly refering to the parameters used by the function.
#' @author Ignacio Ferres
#' @references Altschul, Gish, Miller, Myers & Lipman. (1990) "Basic local
#' alignment search tool." \emph{J. Mol. Biol}. \strong{215}:403-410.
#'
#' Jolley KA, Bray JE, Maiden MCJ. A RESTful application programming interface
#' for the PubMLST molecular typing and genome databases. Database: The Journal
#' of Biological Databases and Curation. 2017;2017:bax060.
#' doi:10.1093/database/bax060.
#' @importFrom parallel mclapply
#' @export
doMLST <- function(infiles,
org='leptospira',
scheme=1L,
schemeFastas=NULL,
schemeProfile=NULL,
write = 'new',
ddir = paste0('pubmlst','_',org,'_',scheme),
fdir = paste0('alleles','_',org,'_',scheme),
n_threads=1L,
pid=90L,
scov=1.0){
if (.Platform$OS.type!='unix'){
stop('Sorry, this package works only on unix-like platforms.')
}
if(Sys.which('blastn')==''){
stop('blastn binary is not in $PATH. Please install it before running this function.')
}
if (is.null(schemeFastas) | is.null(schemeProfile)){
if(!dir.exists(ddir)){
dir.create(ddir, recursive = FALSE)
ddir <- paste0(normalizePath(ddir),'/')
}else{
stop(paste0(ddir, ' already exists.'))
}
}
write <- match.arg(write, c('none', 'new', 'all'))
if (write%in%c('new', 'all')){
if(!dir.exists(fdir)){
#dont normalize path because in following steps directories are created
# recursively.
dir.create(fdir, recursive = FALSE)
}else{
stop(paste0(fdir, ' already exists.'))
}
}
if(pid>100){
stop('pid must be a integer smaller than 100.')
}
if(scov>1 | scov<0){
stop('scov must be between 0 and 1')
}else if(scov < 0.7){
warning('scov below 0.7 . Recomended: scov >= 0.7 . Continuing anyway..',
immediate. = T)
}
if(any(!file.exists(infiles))){
stop('One or more infiles do/es not exists in specified path.')
}
#Valid org?
if(!org%in%listPubmlst_orgs()){
stop(paste0(org,' is not in pubmlst database. Try listPubmlst_orgs() to see avaliables.'))
}
#Fasta files: If NULL, download; else, check if ok.
if(is.null(schemeFastas)){
cat(paste0('Downloading ',org,
' scheme ',scheme,
' MLST sequences at ',ddir,'/ .\n'))
schemeFastas <- downloadPubmlst_seq(org = org,
scheme=scheme,
dir = ddir,
n_threads = 1L)
}else{
if(any(!file.exists(schemeFastas))){
warning('One or more schemeFastas do/es not exists in specified path. Downloading..',
immediate. = T)
cat(paste0('Downloading ',org,
' scheme ',scheme,
' MLST sequences at ',ddir,'/ .\n'))
schemeFastas <- downloadPubmlst_seq(org = org,
scheme=scheme,
dir = ddir,
n_threads = 1L)
}else{
all(grepl('.fas$',schemeFastas)) -> ext
if (ext){
for(i in 1:length(schemeFastas)){
readLines(schemeFastas[i])-> rl
rl[grep('>',rl)]->rl
all(sapply(strsplit(rl,'_'),length)>=2) -> three
if(!three){
stop("Sequence headers doesn't have the expected format. Please download them directly
from pubmlst.org or use the supplied R functions in this package.")
}
}
}else{
stop('Sequences should have the ".fas" extension. Please download them directly from
pubmlst.org or use the supplied R functions in this package.')
}
}
}
#Scheme profile file: If NULL, download; else, check if ok.
if (is.null(schemeProfile)){
cat(paste0('Downloading ',org,
' scheme ',scheme,
' MLST profile at ',ddir,'/ .\n'))
schemeProfile <- downloadPubmlst_profile(org = org,
scheme = scheme,
dir = ddir)
prof <- read.csv(schemeProfile,sep = '\t',header = T, colClasses = 'character')
#Patch to avoid some strange publst profiles issues
if (!'clonal_complex'%in%colnames(prof)){
ap <- apply(prof, 2, function(y){all(is.na(y))})
if(any(ap)){
prof[, which(ap)] <- rownames(prof)
names(prof) <- names(prof)[c(2:dim(prof)[2], 1)]
}
}
}else{
if(!file.exists(schemeProfile)){
warning('schemeProfile file missing, downloading..',immediate. = T)
cat(paste0('Downloading ',org,
' scheme ',scheme,
' MLST profile at ',ddir,'/ .\n'))
schemeProfile <- downloadPubmlst_profile(org = org,
scheme = scheme,
dir = ddir)
prof <- read.csv(schemeProfile,sep = '\t',header = T, colClasses = 'character')
}else{
prof <- read.csv(schemeProfile,sep = '\t',header = T, colClasses = 'character')
fils <- sub('.fas$','',sapply(schemeFastas,function(x){rev(strsplit(x,'/')[[1]])[1]}))
if (!all(fils%in%colnames(prof))){
stop("The scheme fasta files doesn't correspond to the supplied profile file.
(One or more file names are not represented in the profile's column names)")
}
}
}
if (!is.null(schemeFastas)){
#check if blast databases exists
dds <- lapply(sub('fas$', '', schemeFastas), function(x){
paste0(x, c('nsq', 'nin', 'nhr'))
})
dds<- unlist(dds)
if (!all(file.exists(dds))){
#Make BLAST DATABASEs
cat('Making BLAST databases...')
parallel::mclapply(schemeFastas,function(x){
makeblastdb(infile = x)
},mc.preschedule = F,mc.cores = n_threads) -> dbs
unlist(dbs) -> dbs
cat(' DONE!\n')
}else{
dbs <- sub('[.]fas$', '', schemeFastas)
}
}else{
#Make BLAST DATABASEs
cat('Making BLAST databases...')
parallel::mclapply(schemeFastas,function(x){
makeblastdb(infile = x)
},mc.preschedule = F,mc.cores = n_threads) -> dbs
unlist(dbs) -> dbs
cat(' DONE!\n')
}
#Determine mlst for each genome
normalizePath(infiles) -> infiles
cat('Running BLASTN...')
resu <- parallel::mclapply(infiles,function(x){
mlst(genome = x,
dbs = dbs,
write = write,
prefix = fdir,
dir = getwd(),
# dnw = dnw,
n_threads = 1L,
outf = tempdir(),
pid = pid,
scov = scov)
}, mc.preschedule = T,mc.cores = n_threads)
cat(' DONE!\n')
resu <- do.call(rbind,resu)
rownames(resu) <- sub('[.]\\w+$','',basename(infiles))
resu <- as.data.frame(resu, stringsAsFactors = FALSE)
resu <- processResu(resu = resu,
write = write,
dbs = dbs,
dir = paste0(getwd(),'/'),
prefix = fdir)
#There are some cases where a ST doesn't have all the alleles, particularly in cgMLST.
if(length(which(prof=='N'))>0){
prof[which(prof=='N', arr.ind = TRUE)] <- 'NA'
}
#Convert everything to character to allow comparisons:
resu[] <- lapply(resu, as.character)
# prof[] <- lapply(prof, as.character)
sprof <- prof[, colnames(resu)]
#Detect ST
ty <- c('ST', 'cgST')[c('ST', 'cgST')%in%colnames(prof)]
if (ty=='ST'){
ig <- apply(resu,1,function(x){
prof$ST[which(apply(sprof,1,function(y){
identical(x, y)
}))]
})
}else{
ig <- apply(resu,1,function(x){
prof$cgST[which(apply(sprof,1,function(y){
identical(x, y)
}))]
})
colnames(prof) <- sub('^cgST$', 'ST', colnames(prof))
}
ig[!sapply(ig,length)] <- NA
if(length(ig)>0){
resu$ST <- unlist(ig)
}else{
resu$ST <- NA
}
#Return
prof <- prof[, colnames(resu)]
out <- list(result = resu,
profile = prof)
attr(out, 'infiles') <- basename(infiles)
attr(out, 'org') <- org
attr(out, 'scheme') <- scheme
attr(out, 'write') <- write
attr(out, 'pid') <- pid
attr(out, 'scov') <- scov
attr(out, 'class') <- 'mlst'
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.