#' update_phylota(ingroup = "centrolene", outgroup = "cochranella",
#' genes = c("12S","RAG1", "ND1","16S"), MSA=T, ALI=T)
update_phylota <-
function(ingroup,
outgroup,
genes = NULL,
MSA = FALSE,
ALI = FALSE) {
##New version
options(warn = -1)
clade <- ingroup
fn <- "Unaligned"
fn2 <- "Aligned"
fn2.1 <- "Aliscore"
fn3 <- "phylota.sampling.csv"
fn4 <- "included.species.csv"
if (file.exists(fn))
unlink(fn, recursive = T)
if (file.exists(fn2))
unlink(fn2, recursive = T)
if (file.exists(fn2.1))
unlink(fn2.1, recursive = T)
if (file.exists(fn3))
unlink(fn3, recursive = T)
if (file.exists(fn4))
unlink(fn4, recursive = T)
if (length(grep("Molecular_*", list.files("."), value = T)) == 0) {
} else {
unlink(grep("Molecular_*", list.files("."), value = T))
}
cat("\n Get PhyLota clusters \n")
piphy::get_phylota(ingroup, MSA = F, ALI = F)
mainDirect <- getwd()
mainDir<-mainDirect
subwd <- paste0(mainDirect, "/Unaligned/")
##Check for new species in genbank
cat("Checking for novel species in genbank \n")
spp_genbank <-
taxize::downstream(ingroup, db = "ncbi", downto = 'species')[[1]]
spp_sampled <-read.csv(list.files(
path = subwd,
pattern = c("Sampling_matrix"),
full.names = T
))
new_spp <-
setdiff(gsub(" ", "_", spp_genbank$childtaxa_name), spp_sampled$Species)
if (length(new_spp) == 0) {
if (file.exists(fn))
unlink(fn, recursive = T)
stop("Nothing to be added")
} else{
cat("Tranforming each cluster into a Blast DB \n")
##First make DB for each cluster
files <- list.files(path = subwd, pattern = c("Cluster"))
files <- Filter(function(x)
grepl("\\.fasta$", x), files)
subwd<- if(Sys.info()[['sysname']]=="Windows") {
gsub("\\", "/", subwd, fixed=T) } else {subwd}
for (i in 1:length(files)) {
args <-
paste0("-in ", subwd, files[i], " -input_type fasta -dbtype nucl")
system2(
command = 'makeblastdb',
args = args,
wait = FALSE,
stdout = TRUE
)
Sys.sleep(1)
print(i)
}
cat(
length(new_spp),
" species can be added to the ",
dim(spp_sampled)[1],
"that are already sampled in PhyloTa \n"
)
sampled_gene_names <- genes
##Then run each sequence against each cluster
##Start!
##
new_spp2 <- if (is.null(outgroup) == T) {
new_spp
} else{
new_spp <- c(new_spp, outgroup)
}
data <-
scrape.genbank(gsub("_", " ", new_spp), sampled_gene_names)
###Now, I download each sequence and test where it fits better
cat("\nFitting the new sequences into the existing clusters \n")
acce_new <- unique(as.character(na.omit(unlist(data[, -1]))))
if (length(acce_new) == 0) {
stop("No new sequences found")
}
n_clust <- length(list.files(path = subwd, pattern = c("csv")))
species_included <- list()
for (i in 1:length(acce_new)) {
sequence <- read.GenBank(acce_new[i])
Acc_spp <- names(sequence)
names(sequence) <- attr(sequence, "species")
for (j in 1:n_clust) {
write.dna(sequence, "sequence.fasta", format = "fasta")
que <-
paste0("-outfmt 6 -query sequence.fasta -out test.txt -db ",
subwd,
files[j])
system2(
command = 'blastn',
args = que,
wait = FALSE,
stdout = TRUE
)
if (file.info("test.txt")$size > 0) {
cluster <- ape::read.dna(paste0(subwd, files[j]), format = "fasta")
cluster[length(cluster) + 1] <- sequence
if( names(sequence) %in% names(cluster) ){}else{
names(cluster)[length(cluster)] <- names(sequence)
write.dna(cluster, paste0(subwd, files[j]), format = "fasta")
species_included[[i]] <-
data.frame(
Included_species = names(sequence),
AN = Acc_spp,
Cluster = files[j]
)
cat("*******Sequence matched!******* \n")
break ##If sequence is matched, we shold stop.
}
} else {
cat("****Still working \n")
}
}
cat("**Done with sequence",
i,
"of",
length(acce_new),
"****\n")
}
df <- do.call("rbind", species_included)
if (file.exists("sequence.fasta"))
unlink("sequence.fasta")
if (file.exists("test.txt"))
unlink("test.txt")
}
cat("\n Retrieving accession numbers \n")
##Need to include the new seqs to the cluster matrix
addedseqs<-reshape(df, idvar = "Included_species", timevar = "Cluster", direction = "wide")
colnames(addedseqs)<-c("Species", paste0("Cluster_",as.numeric(gsub("[^\\d]+", "", colnames(addedseqs)[-1], perl=TRUE))))
final_sampling<-as.data.frame(rbindlist(list(spp_sampled, addedseqs), fill = TRUE))[,-1]
write.csv( final_sampling,"Unaligned/Sampling_matrix_unaligned.csv" )
setwd(mainDirect)
file.remove(
list.files(
".",
pattern = ".fasta.",
include.dirs = F,
full.names = T,
recursive = T
)
)
if (MSA == TRUE) {
alignments<- align_piphy(mainDir = mainDir, clade=ingroup)
write.csv(final_sampling, "Sampling_matrix_aligned.csv")
setwd(mainDir)
if (ALI == TRUE) {
setwd(mainDir)
sm<-ali_piphy(mainDir, alignments, sm=final_sampling, clade=ingroup)
write.csv(sm, "Sampling_matrix_aliscore.csv")
setwd(mainDir)
unlink(list.dirs()[grep("Aliscore_", list.dirs(), ignore.case = T)], recursive =
TRUE)
}
}
cat("\nDone!!")
options(warn = 0)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.