update_phylota <-
function(lineage,
nsamples = 5,
genes = NULL,
MSA = TRUE,
ALI = TRUE,
outgroup = NULL,
correct_db = TRUE,
delete_all = TRUE,
c_directory = FALSE) {
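## update_phylota(): refresh an existing PhyLoTA-based sampling for `lineage` by
## (i) rebuilding the clusters with get_PHYLOTA(), (ii) querying GenBank for
## species not yet sampled, (iii) BLASTing their sequences against each cluster,
## and (iv) optionally re-aligning (MSA) and masking (ALI/Aliscore) the result.
## Argument roles below are inferred from how they are used in this function:
##   lineage     - taxon name searched in the NCBI taxonomy
##   nsamples    - number of records read per cluster when guessing gene names
##   genes       - optional character vector of gene symbols (skips the guess)
##   MSA, ALI    - run the alignment and Aliscore steps
##   outgroup    - optional species appended to the search list
##   correct_db  - drop unresolvable names and subspecies
##   delete_all  - keep only the corrected cluster files
##   c_directory - if TRUE, work in getwd() instead of tempdir()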
##New version
options(warn = -1)
Clade <- lineage
clade <- lineage
fn <- "Unaligned"
fn2 <- "Aligned"
fn2.1 <- "Aliscore"
fn3 <- "phylota.sampling.csv"
fn4 <- "included.species.csv"
if (file.exists(fn))
unlink(fn, recursive = T)
if (file.exists(fn2))
unlink(fn2, recursive = T)
if (file.exists(fn2.1))
unlink(fn2.1, recursive = T)
if (file.exists(fn3))
unlink(fn3, recursive = T)
if (file.exists(fn4))
unlink(fn4, recursive = T)
if (length(grep("Molecular_*", list.files("."), value = T)) == 0) {
} else {
unlink(grep("Molecular_*", list.files("."), value = T))
}
cat("\n Get PhyLota clusters \n")
rPhyloTa::get_PHYLOTA(lineage, MSA = F, ALI = F, c_directory= c_directory)
mainDirect <- ifelse(c_directory == FALSE, tempdir() , getwd())
mainDir<- ifelse(c_directory == FALSE, tempdir() , getwd())
subwd <- paste0(mainDirect, "/Unaligned/")
##Check for new species in GenBank
cat("Checking for novel species in GenBank \n")
spp_genbank <-
taxize::downstream(lineage, db = "ncbi", downto = 'species')[[1]]
spp_sampled <-
do.call(rbind, lapply(list.files(
path = subwd,
pattern = c(".csv"),
full.names = T
), read.csv))
spp_sampled <- spp_sampled[!duplicated(spp_sampled$name), ]
new_spp <-
setdiff(gsub(" ", "_", spp_genbank$childtaxa_name), spp_sampled$name)
if (length(new_spp) == 0) {
if (file.exists(fn))
unlink(fn, recursive = T)
stop("Nothing to be added")
} else{
cat("Tranforming each cluster into a Blast DB \n")
##First make DB for each cluster
files <- list.files(path = subwd, pattern = c("Cluster"))
files <- Filter(function(x)
grepl("\\.fasta$", x), files)
subwd<- if(Sys.info()[['sysname']]=="Windows") {
gsub("\\", "/", subwd, fixed=T) } else {subwd}
for (i in 1:length(files)) {
args <-
paste0("-in ", subwd, files[i], " -input_type fasta -dbtype nucl")
system2(
command = 'makeblastdb',
args = args,
wait = FALSE,
stdout = TRUE
)
Sys.sleep(1)
print(i)
}
cat(
length(new_spp),
"species can be added to the",
dim(spp_sampled)[1],
"that are already sampled in PhyLoTA \n"
)
##Check which genes are present in the PhyLoTA clusters, based on the first nsamples records of each cluster.
if (is.null(genes)) {
cat("Looking for genes sampled in PhyLoTA \n")
sample_Gi <- do.call(rbind, lapply(list.files(path = subwd,
pattern = c("csv"), full.names = T), read.csv, nrows = nsamples))
sampled_genes <- ncbi_byid(gsub("gi", "", as.vector(sample_Gi$gi)))
choosebank("genbank")
gene_fin <- list()
for (i in 1:length(sampled_genes$acc_no)) {
## query()/getAnnot() are from seqinr; the list name passed to query() is arbitrary
acc_rec <-
query("acc_rec", paste0("AC=", gsub(
"\\..*", "", sampled_genes$acc_no[i]
)))
annots <- getAnnot(acc_rec$req[[1]])
a <- gsub("/gene=\"", "", grep("/gene=", annots, value = T))
b <- gsub("\"", "", a)
gene_fin[[i]] <- gsub("[[:space:]]", "", b)[1]
cat("New gene symbol found!",
gsub("[[:space:]]", "", b)[1],
"\n")
}
closebank()
sampled_gene_names <-levels(factor(unlist(gene_fin)))
cat("\n Your sampling includes", levels(factor(unlist(gene_fin))), "loci \n")
} else{
sampled_gene_names <- genes
}
##Then run each sequence against each cluster
#some functions
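## The grab.results()/first.of.type()/scrape.genbank() helpers below search the
## NCBI nuccore database via esearch()/esummary()/content(); these calls appear
## to come from the 'reutils' package (an assumption -- they are not namespaced
## here).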
#######################################################################
# Genbank scraping functions
# Copyright (C) 2015 Anusha Beer <anbeer29@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Search Genbank for search term e.g. species name. Returns a dataframe
# of all available (nucleotide) sequences on Genbank.
#
# Give it a string e.g. species name. Returns a dataframe. Each row is a
# result and each column is an attribute of the results. Some of the imprtant
# attributes are "Title", "OSLT". Title contains a description of the
# sequence. OSLT contains acession number.
grab.results <- function (term) {
# Search for the given term on nuccore. This gives us a list of
# record IDs.
ids <- esearch(term, db="nuccore")
# Grab summaries for the given record IDs, as a sort-of data frame.
sum <- esummary(ids, db="nuccore")
data <- content(sum, as="parsed")
# For some reason, this parser gives us lists of lists instead of a
# proper data frame (which should be lists of vectors). Return a
# fixed-up version.
data.frame(lapply(data, as.character), stringsAsFactors=FALSE)
}
#######################################################################
# Takes a dataframe of Genbank results and a keyword string and returns a
# string containing an accession number. This is the accession number of
# the first result in the table that had the keyword in its Title field.
first.of.type <- function (results, type) {
# Filter rows whose title contains the given text
filtered <- results[grepl(type, results$Title, fixed=TRUE),]
# Return the first accession number
filtered$OSLT[1]
}
#######################################################################
# Given a list of species and genes, return a dataframe of accession numbers.
#
# Takes string vectors as arguments and returns a dataframe. The rows of the
# dataframe are species. There is a column for each gene. These columns are
# filled with accession numbers. The species names are put in a column called
# "Species".
scrape.genbank <- function (species, genes) {
# Create dataset skeleton
data <- data.frame(Species=species)
for (i in genes) {
data[,i] <- rep(NA, length(species))
}
# Look up data for each species
for (i in 1:length(species)) {
n <- species[i]
print(sprintf("Looking up for %s (%d/%d)...", n, i, length(species)))
tryCatch({
r <- grab.results(paste(n, "[Primary Organism]"))
for (g in genes) {
data[i,g] <- first.of.type(r, g)
}
}, error = function(e) { })
}
data
}
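## Illustrative sketch of what scrape.genbank() returns (species and accession
## numbers here are hypothetical placeholders, not real results):
##   scrape.genbank(c("Genus species1", "Genus species2"), c("COI", "cytb"))
##   #          Species        COI       cytb
##   # 1 Genus species1 AB000001.1 AB000002.1
##   # 2 Genus species2 AB000003.1       <NA>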
##Start!
##
##Append the outgroup (if any) to the list of species to search for
if (!is.null(outgroup)) {
new_spp <- c(new_spp, outgroup)
}
data <-
scrape.genbank(gsub("_", " ", new_spp), sampled_gene_names)
###Download each new sequence and test which existing cluster it fits best
cat("\nFitting the new sequences into the existing clusters \n")
acce_new <- as.character(na.omit(unlist(data[, -1])))
if (length(acce_new) == 0) {
stop("No new sequences found")
}
n_clust <- length(list.files(path = subwd, pattern = c("csv")))
species_included <- list()
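## For each new accession: write it to a temporary FASTA file, BLAST it against
## every cluster database, and append it to the first cluster that returns a
## hit (i.e. test.txt is non-empty).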
for (i in 1:length(acce_new)) {
sequence <- read.GenBank(acce_new[i])
Acc_spp <- names(sequence)
names(sequence) <- attr(sequence, "species")
for (j in 1:n_clust) {
write.dna(sequence, "sequence.fasta", format = "fasta")
que <-
paste0("-outfmt 6 -query sequence.fasta -out test.txt -db ",
subwd,
files[j])
system2(
command = 'blastn',
args = que,
wait = FALSE,
stdout = TRUE
)
if (file.info("test.txt")$size > 0) {
cluster <- read.dna(paste0(subwd, files[j]), format = "fasta")
cluster[length(cluster) + 1] <- sequence
names(cluster)[length(cluster)] <- names(sequence)
write.dna(cluster, paste0(subwd, files[j]), format = "fasta")
species_included[[i]] <-
data.frame(
Included_species = names(sequence),
AN = Acc_spp,
Cluster = files[j]
)
cat("*******Sequence matched!******* \n")
break ##If the sequence is matched, we should stop searching clusters.
} else {
cat("****Still working \n")
}
}
cat("**Done with sequence",
i,
"of",
length(acce_new),
"****\n")
}
df <- do.call("rbind", species_included)
if (file.exists("sequence.fasta"))
unlink("sequence.fasta")
if (file.exists("test.txt"))
unlink("test.txt")
}
setwd(mainDirect)
cat("\n Retrieving accession numbers \n")
filesna <-
list.files(path = subwd,
pattern = c(".csv"),
full.names = T)
filesna2 <- list.files(path = subwd, pattern = c(".csv"))
everyGi <- lapply(filesna, read.csv)
dat_acc2 <- list()
for (m in 1:length(filesna)) {
dat_acc <- ncbi_byid(gsub("gi", "", as.vector(everyGi[[m]]$gi)))
dat_acc2[[m]] <-
cbind.data.frame(taxa = dat_acc$taxon , dat_acc$acc_no)
}
merged.data.frame <-
Reduce(function(...)
merge(..., by = "taxa", all = TRUE), dat_acc2)
names(merged.data.frame) <- c("Species", filesna2)
write.csv(merged.data.frame,
paste0("Molecular_sampling_", lineage, ".csv"))
if (file.exists("sequence.fasta"))
unlink("sequence.fasta")
if (file.exists("test.txt"))
unlink("test.txt")
if (file.exists("Test.csv"))
unlink("Test.csv")
##Checking taxonomy
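## Species whose names score below 0.98 in the Global Names Resolver
## (taxize::gnr_resolve) or that carry a subspecific epithet (three name parts)
## are flagged and dropped from both the sampling table and the cluster files.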
if (correct_db == T) {
speciesbdb <-
read.csv(paste0("Molecular_sampling_", lineage, ".csv"))
del <- c()
for (i in 1:dim(speciesbdb)[1]) {
del[i] <-
if (gnr_resolve(names = as.character(speciesbdb[i, 2]))[1, "score"] < 0.98)
"delete"
else
"Ok"
cat("Checking species", i, "of", dim(speciesbdb)[1], "\n")
}
sp_names <- strsplit(as.character(speciesbdb[, 2]), " ")
del2 <- c()
for (i in 1:length(sp_names)) {
del2[i] <- if (length(sp_names[[i]]) > 2)
"Delete"
else
"Ok"
}
spp_delete <-as.character(speciesbdb[, 2])[which(del == "Delete" |
del2 == "Delete")]
##Correct DB
corrected.db <- speciesbdb[!speciesbdb$Species %in% spp_delete, ]
##Correct clusters
for (i in 1:length(files)) {
cluster <- read.dna(paste0(subwd, files[i]), format = "fasta")
rep_la <- which(names(cluster) %in% gsub(" ", "_", spp_delete))
if (length(rep_la) > 0) {
cluster <- cluster[-(rep_la)]
}
setwd(file.path(mainDir, "unaligned"))
if (length(names(cluster)) == 0) {
write.table(c("Cluster", i, "skipped"),
paste0("No_cluster_", files[i]))
next
} else {
dp <- which(duplicated(names(cluster)))
if (length(dp) > 0) {
cluster <- cluster[-(dp)]
}
write.dna(cluster, paste0("corrected_", files[i]), format = "fasta")
}
}
} else {
##With correct_db = FALSE, keep the sampling table as is
corrected.db <- read.csv(paste0("Molecular_sampling_", lineage, ".csv"))
cat("Done!")
}
setwd(mainDir)
write.csv(df, "included.species.csv")
write.csv(corrected.db, "phylota.sampling.csv")
mol_files <- grep("Molecular_", list.files("."), value = T)
if (length(mol_files) > 0)
unlink(mol_files)
if (delete_all == T) {
do.call(file.remove, list(setdiff(
list(list.files(subwd, full.names = TRUE))[[1]], grep("corrected" , list(list.files(
subwd, full.names = TRUE
))[[1]], value = T)
)))
} else{
cat("Done")
}
##
cat("Creating a summary file of your molecular sampling")
db1 <- read.csv("included.species.csv")
db2 <- read.csv("phylota.sampling.csv")
db3 <- db2[, -c(1:2)]
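## Append each newly added species as a new row: its accession number goes in
## the column whose cluster number matches the cluster it was BLASTed into,
## and the remaining cells stay NA.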
for (i in 1:dim(db1)[1]) {
a <-
unlist(regmatches(
as.character(db1[i, "Cluster"]),
gregexpr("[[:digit:]]+", as.character(db1[i, "Cluster"]))
))
b <-
unlist(regmatches(names(db3)[-c(1)], gregexpr("[[:digit:]]+", names(db3)[-c(1)])))
to_add <- NULL
to_add <- rep(NA, dim(db3)[2])
to_add[1] <- as.character(db1$Included_species)[i]
to_add[which(a == b) + 1] <- as.character(db1$AN)[i]
db3 <- data.frame(rbind(as.matrix(db3), t(as.matrix(to_add))))
print(i)
}
db3 <- db3[order(db3$Species),]
db3<- db3[!duplicated(db3$Species),]
db3<-melt(db3,id='Species',na.rm=TRUE)
db3<-dcast(db3,Species~as.character(variable),value.var='value')
write.csv(db3, paste0("Molecular_sampling_", lineage, ".csv"))
#if (file.exists(fn3)) unlink(fn3,recursive =T)
#if (file.exists(fn4)) unlink(fn4,recursive =T)
cat("Please go over the molecular sampling file carefully")
##Do MSA--------
##Read fasta files
if(MSA==TRUE){
cat("\nalignment in process. Please be patient\n")
setwd(file.path(mainDir, "Unaligned"))
temp = list.files(pattern="*.fasta")
##Chose method: "Muscle", "ClustalOmega", "ClustalW"
align_PHYLOTA<-function(file.names){
mySequences <- readDNAStringSet(file.names)
myFirstAlignment <- msa(mySequences, "ClustalOmega")
aln <- as.DNAbin(msaConvert(myFirstAlignment, type="phangorn::phyDat"))
}
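## readDNAStringSet() is from Biostrings and msa() from the Bioconductor 'msa'
## package; "ClustalOmega" can be swapped for "Muscle" or "ClustalW" as noted
## above. pblapply() below is from the 'pbapply' package.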
alignments<-pblapply(temp,align_PHYLOTA)
options(warn=-1)
dir.create(file.path(mainDir, "Aligned"))
options(warn = 0)
setwd(file.path(mainDir, "Aligned"))
for (i in 1: length(alignments)){
write.dna(alignments[[i]], paste0(clade, "_", "Alignment","Cluster" ,i, ".fasta"), format="fasta")
}
cat("\nAligned and unaligned sequences are in separated folders within your working directory")
if (ALI==TRUE){
####For Aliscore------
setwd(mainDir)
if( "Aliscore_v.2.0" %in% list.files() == FALSE){
download.file("http://software.zfmk.de/ALISCORE_v2.0.zip",'Aliscore_v.2.0.zip')
unzip("Aliscore_v.2.0.zip")
unzip("ALISCORE_v2.0/Aliscore_v.2.0.zip")
} else
options(warn=-1)
dir.create(file.path(mainDir, "Aliscore"))
options(warn=0)
setwd(file.path(mainDir, "Aliscore"))
plot.progress <- function(percent) {
plot(c(0,100), c(0,1), type='n', xlab='', ylab='', yaxt='n')
rect(0, 0.1, percent*100, 0.9, col='blue')
title(paste('Progress: ', round(percent*100,2), '%', sep=''))
}
aln_aliscored <- list()
for (i in 1:length(alignments)) {
aln_aliscored[[i]] <- aliscore(alignments[[i]], gaps = "ambiguous", w = 3, path = paste0(mainDir, "/Aliscore_v.2.0"))
write.dna(aln_aliscored[[i]], paste0(clade, "_", "AliScore", "_Alignment", "_Cluster_", i, ".fasta"), format = "fasta")
plot.progress(i / length(alignments))
}
cat("\nCurated alignments are in the Aliscore folder within the output directory\n")
setwd(mainDirect)
}
setwd(mainDirect)
}
setwd(mainDirect)
####Remove subspecies for the last time
dfs <-
lapply(list.files(
pattern = c(".csv"),
full.names = T
), read.csv)
files_names<-list.files(
pattern = c(".csv"),
full.names = T
)
dfs_co<-lapply(1:length(dfs), function(x){
vec<- gsub(" ", "_",as.character(dfs[[x]][,grep("species",colnames(dfs[[x]]),ignore.case=TRUE)]))
num1<-which(sapply(strsplit(vec, "_"), length)==3)
df_corr<- if(length(num1)==0){ dfs[[x]]}else{dfs[[x]][-num1,]}
return(df_corr)
})
lapply(1:length(dfs_co), function(x){
write.csv(dfs_co[[x]], file=files_names[x])
})
alns <-
lapply(list.files(
pattern = c(".fasta"),
full.names = T, recursive = T
), read.dna, "fasta")
names_alns <-list.files(
pattern = c(".fasta"),
full.names = T, recursive = T
)
lapply(1:length(alns), function(x){
aln_ex<-alns[[x]]
num1<-which(sapply(strsplit(labels(aln_ex), "_"), length)==3)
if(length(num1)==0){ }else{
newaln<- if(is.null(dim(aln_ex)) ==F){
aln_ex[-(num1),]
}else{
aln_ex[-(num1)]
}
write.dna(newaln, names_alns[x], format = "fasta")
}
})
####Re organize datasets
dfs <-
lapply(list.files(
pattern = c(".csv"),
full.names = T
), read.csv)
files_names<-list.files(
pattern = c(".csv"),
full.names = T
)
species_correct <- unique(gsub(" ", "_", as.character(dfs[[2]][, grep("species", colnames(dfs[[2]]), ignore.case = TRUE)])))
dfs_co<-lapply(1:length(dfs), function(x){
spps<- gsub(" ", "_",as.character(dfs[[x]][,grep("species",colnames(dfs[[x]]),ignore.case=TRUE)]))
new_df<-dfs[[x]][!duplicated(spps),]
spps<- gsub(" ", "_",as.character(new_df[,grep("species",colnames(new_df),ignore.case=TRUE)]))
new_df<-new_df[which(spps %in% species_correct),]
return(new_df)
})
lapply(1:length(dfs_co), function(x){
write.csv(dfs_co[[x]], file=files_names[x])
})
alns <-
lapply(list.files(
pattern = c(".fasta"),
full.names = T, recursive = T
), read.dna, "fasta")
names_alns <-list.files(
pattern = c(".fasta"),
full.names = T, recursive = T
)
lapply(1:length(alns), function(x){
aln_ex<-alns[[x]]
num1<-which(sapply(strsplit(labels(aln_ex), "_"), length)==3)
num2<-which(! labels(aln_ex) %in% species_correct)
num3<-c(num1,num2)
if(length(num3)==0){ }else{
newaln<- if(is.null(dim(aln_ex)) ==F){
aln_ex[-(num3),]
}else{
aln_ex[-(num3)]
}
write.dna(newaln, names_alns[x], format = "fasta")
}
})
cat("\nDone!!")
options(warn = 0)
}
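## Example call (illustrative only, not run: it needs internet access, the
## BLAST+ binaries on the PATH, and the package dependencies loaded; "Felis"
## is an arbitrary example lineage):
## update_phylota("Felis",
##                nsamples = 5,
##                genes = NULL,
##                MSA = TRUE,
##                ALI = FALSE,
##                outgroup = NULL,
##                correct_db = TRUE,
##                delete_all = TRUE,
##                c_directory = TRUE)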