## This code is part of the megaptera package
## © C. Heibl 2014 (last update 2018-12-18)
#' @title Detect Homology Uncertainty in Alingment
#' @description Use GBLOCKS to detect positions of uncertain homology and remove
#' them from the alignment.
#' @param x An object of class \code{\link{megapteraProj}}.
#' @return None, \code{stepGBLOCKS} is called for its side effects.
#' @seealso \code{\link{megapteraProj}}; \code{\link{stepMAFFT}} for the
#' preceeding step and \code{\link{stepH}} for the subsequent step.
#' @export
#' @import DBI
#' @importFrom ips gblocks write.nex write.phy
stepGBLOCKS <- function(x){
start <- Sys.time()
## CHECKS
## ------
if (!inherits(x, "megapteraProj"))
stop("'x' is not of class 'megapteraProj'")
if (x@locus@kind == "undefined") stop("undefined locus not allowed")
## check if previous step has been run
## -----------------------------------
status <- dbProgress(x)
if (status$step_g == "pending") {
stop("the previous step has not been called yet")
}
if (status$step_g == "error") {
stop("the previous step has terminated with an error")
}
if (status$step_g == "failure") {
slog("\nNo data from upstream available - quitting", file = "")
return()
}
## Currently progress end with stepH (2018-02-26)
# if (status$step_h == "success") {
# dbProgress(x, "step_i", "error")
# }
## DEFINITIONS
## -----------
gene <- x@locus@sql
acc.tab <- paste("acc", gsub("^_", "", gene), sep = "_")
tip.rank <- x@taxon@tip.rank
msa.tab <- paste(tip.rank, "sequence", sep = "_")
block.max.dist <- x@params@block.max.dist
max.bp <- x@params@max.bp * 1.5
min.n.seq <- x@params@min.n.seq
gblocks.exe <- x@mask.exe
b1 <- x@params@gb1
b2 <- x@params@gb2
b3 <- x@params@gb3
b4 <- x@params@gb4
b5 <- x@params@gb5
## iniate logfile
## --------------
logfile <- paste0("log/", gene, "-stepI.log")
if (file.exists(logfile)) unlink(logfile)
slog(paste("\nmegaptera", packageDescription("megaptera")$Version),
paste("\n", Sys.time(), sep = ""),
"\nSTEP I: detecting nucleotide positions of uncertain homology\n",
paste("\nLocus:", x@locus@sql),
file = logfile)
## open database connection
## ------------------------
conn <- dbconnect(x)
## check if msa table exists
## -------------------------
if (!dbExistsTable(conn, msa.tab)){
dbDisconnect(conn)
slog("\nWARNING: table", msa.tab, "does not exist!\n", file = logfile)
td <- Sys.time() - start
slog("\nSTEP I finished after", round(td, 2), attr(td, "units"), "\n",
file = logfile)
return()
}
## check if at least 3 (ingroup) species are available
## ---------------------------------------------------
n <- paste("SELECT count(taxon)",
"FROM", msa.tab,
"WHERE", wrapSQL(gene, "locus", "="))
n <- dbGetQuery(conn, n)$count
if (n < 3){
dbDisconnect(conn)
slog("\nWARNING: only", n, "species available, no alignment possible\n",
file = logfile, megProj = x)
td <- Sys.time() - start
slog("\nSTEP GG finished after", round(td, 2), attr(td, "units"),
"\n", file = logfile, megProj = x)
dbProgress(x, "step_g", "failure")
return()
}
if (n < 100){ # 100 is arbitrary
n <- dbGetQuery(conn, paste("SELECT taxon FROM", msa.tab,
"WHERE", wrapSQL(gene, "locus", "=")))
n <- which(is.ingroup(x, n$taxon))
if (length(n) < 3){
dbDisconnect(conn)
slog("\nWARNING:", length(n), "ingroup species available, no alignment possible\n",
file = logfile, megProj = x)
td <- Sys.time() - start
slog("\nSTEP GG finished after", round(td, 2), attr(td, "units"),
"\n", file = logfile, megProj = x)
dbProgress(x, "step_g", "failure")
return()
}
}
## read alignment
## --------------
slog("\nReading alignment", file = logfile)
a <- dbReadMSA(x)
if (is.null(a)) {
dbDisconnect(conn)
slog("\nWARNING: no sequences conform to current parameter setting\n", file = logfile)
td <- Sys.time() - start
slog("\nSTEP I finished after", round(td, 2), attr(td, "units"), "\n",
file = logfile)
return()
}
## Masking of poorly aligned nucleotides
## -------------------------------------
slog("\nMasking poorly aligned regions:", file = logfile)
score <- gblocks(a, exec = gblocks.exe,
b1 = b1, b2 = b2, b3 = b3, b4 = b4, b5 = b5,
target = "score") # with least conservative default
## Write scores to database
## ------------------------
slog("\nWrite reliability scores to database ...", file = logfile)
dbWriteMSA(x, dna = a, score = score, status = "aligned")
slog("done", file = logfile)
## write files
## -----------
# slog("\n.. write alignment to files ..", file = logfile)
# if ( length(a) == 1 ) {
# write.phy(a[[1]], paste0("msa/", gene, "-masked.phy"))
# rownames(a[[1]]) <- gsub("-", "_", rownames(a[[1]]))
# write.nex(a[[1]], paste0("msa/", gene, "-masked.nex"))
# } else {
# for ( i in 1:length(a) ){
# b <- paste("block", i, sep = "")
# write.phy(a[[i]], paste0("msa/", gene, "-", b, "-masked.phy"))
# rownames(a[[i]]) <- gsub("-", "_", rownames(a[[i]]))
# write.nex(a[[i]], paste0("msa/", gene, "-", b, "-masked.nex"))
# }
# a <- do.call(cbind.DNAbin, c(a, fill.with.gaps = TRUE))
# b <- paste(length(n), "blocks", sep = "")
# write.phy(a, paste0("msa/", gene, "-masked.phy"))
# rownames(a) <- gsub("-", "_", rownames(a))
# write.nex(a, paste0("msa/", gene, "-masked.nex"))
# }
## summary of result
## -----------------
# slog(paste("\n\n--- final alignment of", gene, "---"),
# paste("\nnumber of sequences :", nrow(a)),
# paste("\nnumber of base pairs :", ncol(a)),
# file = logfile)
# if ( length(n) > 1 ) {
# slog("\nnumber of blocks :", length(n),
# "(", paste(n, collapse = ", "), ")",
# file = logfile)
# }
dbDisconnect(conn)
slog("\n\nSTEP I finished", file = logfile)
td <- Sys.time() - start
slog(" after", round(td, 2), attr(td, "units"), file = logfile)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.