rm(list=ls())
args = commandArgs(trailingOnly = T)
source("R/seq_designation_nolength.R")
source("R/node_info_nolength.R")
source("R/lineage_info.R")
library(dplyr)
library(ggtree)
library(ape)
library(phytools)
library(phangorn)
install.packages('castor', repos = "http://cran.us.r-project.org")
library(castor)
library(ips)
tree<-ape::read.tree(paste(args, "/Trees/", args, "_combined_aligned.fasta.contree", sep = ""))
ancestral<-seqinr::read.alignment(paste(args, "/Timetree/ancestral_sequences.fasta", sep = ""), format = "fasta")
metadata<-read.csv(paste(args, "/", args, "_combined_metadata.csv", sep = ""))
metadata$year[which(is.na(metadata$year))]<-"-"
alignment<-seqinr::read.alignment(paste(args, "/Alignment/", args, "_combined_aligned.fasta", sep = ""), format = "fasta")
all_lineage<-read.csv("inst/extdata/References/RABV/lineage_info.csv")
node_data<-node_info(tree, 90, alignment, metadata, ancestral)
seq_data<-seq_designation(tree, 90, alignment, metadata, ancestral)
names(node_data)[5]<-"number"
for (i in 1:length(node_data$lineage)) {
seq_data$lineage[which(seq_data$lineage == node_data$lineage[i])]<-node_data$number[i]
}
lineage_info<-lineage_info(seq_data, metadata)
for (i in 1:length(node_data$lineage)) {
lineage_info$lineage[which(lineage_info$lineage == node_data$lineage[i])]<-node_data$number[i]
}
assignments<-read.csv(paste(args, "/Assignment/assignment.csv", sep = ""))
clades<-data.frame(clade=c("Africa", "Asian", "Arctic", "Bat", "Cosmopolitan", "Indian", "RAC"), present=NA)
assignment_names<-unique(assignments$lineage)
for (i in 1:length(clades$clade)) {
if (length(grep(clades$clade[i], assignment_names)) != 0) {
clades$present[i]<-"Y"
}
}
numbers<-which(clades$present == "Y")
sequences1<-NA
sequences2<-NA
sequences3<-NA
sequences4<-NA
sequences5<-NA
sequences6<-NA
sequences7<-NA
if (1 %in% numbers) {
sequences1<-read.csv("Datasets/Africa_N/Africa_N_sequence_data.csv")
}
if (2 %in% numbers) {
sequences2<-read.csv("Datasets/Asian_N/Asian_N_sequence_data.csv")
}
if (3 %in% numbers) {
sequences3<-read.csv("Datasets/Arctic_N/Arctic_N_sequence_data.csv")
}
if (4 %in% numbers) {
sequences4<-read.csv("Datasets/Bat_N/Bat_N_sequence_data.csv")
}
if (5 %in% numbers) {
sequences5<-read.csv("Datasets/Cosmo_N/Cosmo_N_sequence_data.csv")
}
if (6 %in% numbers) {
sequences6<-read.csv("Datasets/Indian_N/Indian_N_sequence_data.csv")
}
if (7 %in% numbers) {
sequences7<-read.csv("Datasets/RAC-SK_N/RAC-SK_N_sequence_data.csv")
}
sequences<-rbind(sequences1, sequences2, sequences3, sequences4, sequences5, sequences6, sequences7)
if(length(which(is.na(sequences$ID))) != 0) {
sequences<-sequences[-c(which(is.na(sequences$ID))),]
}
sequences<-sequences[which(sequences$ID %in% metadata$ID),]
numbers<-which(clades$present == "Y")
current<-data.frame(lineage=unique(assignments$lineage), node=NA)
if (length(which(is.na(current$lineage))) != 0) {
current<-current[-c(which(is.na(current$lineage))),]
}
for (i in 1:length(current$lineage)) {
if( length(ape::getMRCA(tree, tip = c(sequences$ID[which(sequences$cluster == current$lineage[i])]))) != 0) {
current$node[i]<-ape::getMRCA(tree, tip = c(sequences$ID[which(sequences$cluster == current$lineage[i])]))
}
}
if(length(which(is.na(current$node))) != 0){
current<-current[-c(which(is.na(current$node))),]
}
if(length(which(duplicated(current$node))) != 0) {
current<-current[-c(which(duplicated(current$node))),]
}
if (length(which(node_data$node %in% current$node)) != 0) {
node_data<-node_data[-c(which(node_data$node %in% current$node)),]
}
updates<-data.frame(lineage = current$lineage, count = NA, node = NA, old=NA)
for (i in 1:length(updates$lineage)) {
updates$count[i]<-length(which(assignments$lineage == updates$lineage[i]))
}
if (length(which(updates$count >=10))!= 0) {
updates<-updates[which(updates$count >= 10),]
for (i in 1:length(updates$lineage)) {
updates$node[i]<-ape::getMRCA(tree, tip = c(assignments$ID[which(assignments$lineage == updates$lineage[i])]))
}
if (length(which(duplicated(updates$node))) != 0) {
updates<-updates[-c(which(duplicated(updates$node))),]
}
updates$viable<-NA
for (i in 1:length(updates$lineage)) {
updates$viable[i]<-
length(which(node_data$node %in% phangorn::Descendants(tree, updates$node[i], type = "all"))) -
(length(which(updates$node %in% phangorn::Descendants(tree, updates$node[i], type = "all"))) +
length(which(current$node[-c(which(current$node %in% node_data$node))]
%in% phangorn::Descendants(tree, updates$node[i], type = "all")))
)
}
if(length(which(updates$viable <= 0)) != 0) {
updates<-updates[-c(which(updates$viable <= 0)),]
}
if (length(updates$lineage) != 0) {
existing<-updates
updates<-updates[order(updates$viable),]
test<-data.frame(lineage = NA, count = NA, node = NA, viable = NA)
numbers<-which(updates$viable > 0)
x<-1
while (length(numbers) != 0 && x < 100) {
test<-data.frame(lineage = NA, count = NA, node = NA, viable = NA, old = NA)
for (i in 1:length(numbers)) {
test$node<-node_data$node[which(node_data$node %in% phangorn::Descendants(tree, updates$node[numbers[i]], type = "all"))][1]
test$lineage<-paste(updates$lineage[numbers[i]], ".1", sep = "")
updates<-rbind(updates, test)
}
for (i in 1:length(updates$lineage)) {
updates$viable[i]<-
length(which(node_data$node %in% phangorn::Descendants(tree, updates$node[i], type = "all"))) -
(length(which(updates$node %in% phangorn::Descendants(tree, updates$node[i], type = "all"))) +
length(which(current$node[-c(which(current$node %in% node_data$node))]
%in% phangorn::Descendants(tree, updates$node[i], type = "all")))
)
}
if (length(which(duplicated(updates$node))) != 0) {
updates<-updates[-c(which(duplicated(updates$node))),]
}
int<-updates[-c(which(updates$lineage %in% existing$lineage)),]
int<-rbind(int, updates[which(duplicated(updates$lineage)),])
problem_names<-data.frame(letters = c("A1", "B1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "J1", "K1", "L1", "M1", "N1",
"O1", "P1", "Q1", "R1", "S1", "T1", "U1", "V1", "W1", "X1", "Y1", "Z1"))
numbers<-grep("\\..\\..\\..", int$lineage)
for (i in 1:length(numbers)) {
updates$old[which(updates$lineage == int$lineage[numbers[i]])]<-int$lineage[numbers[i]]
}
int$update<-int$lineage
int$update<-stringr::str_replace(int$update, "A1\\..\\..\\..", "B1")
int$update<-stringr::str_replace(int$update, "B1\\..\\..\\..", "C1")
int$update<-stringr::str_replace(int$update, "C1\\..\\..\\..", "D1")
int$update<-stringr::str_replace(int$update, "D1\\..\\..\\..", "E1")
int$update<-stringr::str_replace(int$update, "E1\\..\\..\\..", "F1")
int$update<-stringr::str_replace(int$update, "F1\\..\\..\\..", "G1")
int$update<-stringr::str_replace(int$update, "G1\\..\\..\\..", "H1")
int$update<-stringr::str_replace(int$update, "H1\\..\\..\\..", "I1")
int$update<-stringr::str_replace(int$update, "I1\\..\\..\\..", "J1")
int$update<-stringr::str_replace(int$update, "J1\\..\\..\\..", "K1")
int$update<-stringr::str_replace(int$update, "K1\\..\\..\\..", "L1")
int$update<-stringr::str_replace(int$update, "L1\\..\\..\\..", "M1")
int$update<-stringr::str_replace(int$update, "M1\\..\\..\\..", "N1")
duplicates<-int$update[which(duplicated(int$update))]
duplicates<-c(duplicates, int$update[which(int$update %in% c(all_lineage$lineage, existing$lineage))])
if(length(duplicates) != 0) {
problems<-duplicates[which(stringr::str_count(duplicates, pattern = "\\.") == 0)]
duplicates<-duplicates[which(stringr::str_count(duplicates, pattern = "\\.") != 0)]
} else {
problems<-{}
}
while (length(duplicates) != 0) {
for (i in 1:length(duplicates)) {
test<-which(int$update == duplicates[i])
v<-1
for (j in 1:length(test)) {
name<-unlist(stringr::str_split(int$update[test[j]], "\\."))
name[length(name)]<-v+as.integer(name[length(name)])
v<-(v+1)
int$update[test[j]]<-paste(c(name), collapse='.' )
}
duplicates<-int$update[which(duplicated(int$update))]
duplicates<-c(duplicates, int$update[which(int$update %in% all_lineage$lineage)])
if(length(duplicates) != 0) {
duplicates<-duplicates[which(stringr::str_count(duplicates, pattern = "\\.") != 0)]
}
}
}
while (length(problems) != 0) {
for (i in 1:length(problems)) {
test<-which(int$update == problems[i])
if(length(strsplit(problems[i], "_")[[1]]) != 1){
subclade<-strsplit(problems[i], "_")[[1]][1]
lineage<-strsplit(problems[i], "_")[[1]][2]
lineage<-problem_names$letters[(which(problem_names$letters == lineage))+1]
int$update[test[1]]<-paste(subclade, lineage, sep = "_")
} else {
int$update[test[1]]<-problem_names$letters[(which(problem_names$letters == problems[i]))+1]
}
}
duplicates<-int$update[which(duplicated(int$update))]
duplicates<-c(duplicates, int$update[which(int$update %in% all_lineage$lineage)])
if(length(duplicates) != 0 ){
problems<-duplicates[which(stringr::str_count(duplicates, pattern = "\\.") == 0)]
} else {
problems<-{}
}
}
for (i in 1:length(int$lineage)) {
updates$lineage[which(updates$node == int$node[i])]<-int$update[i]
}
for (i in 1:length(updates$lineage)) {
updates$viable[i]<-
length(which(node_data$node %in% phangorn::Descendants(tree, updates$node[i], type = "all"))) -
(length(which(updates$node %in% phangorn::Descendants(tree, updates$node[i], type = "all"))) +
length(which(current$node[-c(which(current$node %in% node_data$node))]
%in% phangorn::Descendants(tree, updates$node[i], type = "all")))
)
}
numbers<-which(updates$viable > 0)
x<-x+1
}
updates<-updates[-c(which(updates$lineage %in% sequences$cluster)),]
node_updates<-data.frame()
node_data$lineage<-NA
for (i in 1:length(updates$lineage)) {
node_updates<-rbind(node_updates, node_data[which(node_data$node == updates$node[i]),])
node_updates$lineage[i]<-updates$lineage[i]
}
for (i in 1:length(node_updates$node)) {
seq_data$lineage[which(seq_data$lineage == node_updates$number[i])]<-node_updates$lineage[i]
}
for (i in 1:length(node_updates$node)) {
lineage_info$lineage[which(lineage_info$lineage == node_updates$number[i])]<-node_updates$lineage[i]
}
if (length(which(lineage_info$lineage %in% 1:1000)) != 0) {
lineage_info<-lineage_info[-c(which(lineage_info$lineage %in% 1:1000)),]
}
numbers<-which(assignments$ID %in% seq_data$ID[which(seq_data$lineage %in% 1:1000)])
for (i in 1:length(numbers)) {
seq_data$lineage[which(seq_data$ID == assignments$ID[numbers[i]])]<-assignments$lineage[numbers[i]]
}
if (length(which(is.na(seq_data$lineage))) != 0) {
seq_data<-seq_data[-c(which(is.na(seq_data$lineage))),]
}
if (length(which(seq_data$lineage %in% 1:1000)) != 0) {
seq_data<-seq_data[-c(which(seq_data$lineage %in% 1:1000)),]
}
new_seq<-seq_data[which(seq_data$ID %in% assignments$ID),]
all_lineage<-read.csv("inst/extdata/References/RABV/lineage_info.csv")
all_lineage<-all_lineage[which(all_lineage$lineage %in% assignments$lineage),]
lineage_info$parent<-NA
numbers<-grep("\\.", lineage_info$lineage)
if (length(grep("\\.", lineage_info$lineage))!=0){
for (i in 1:length(numbers)) {
parent<-strsplit(lineage_info$lineage[grep("\\.", lineage_info$lineage)], "\\.")[[i]]
parent<-parent[1:length(parent)-1]
if (length(parent) == 1) {
lineage_info$parent[numbers[i]]<-parent
} else {
parent<-paste(parent[1], parent[2], sep = ".")
lineage_info$parent[numbers[i]]<-parent
}
}
}
if(length(which(is.na(lineage_info$parent))) != 0) {
numbers<-which(is.na(lineage_info$parent))
for (i in 1:length(numbers)) {
x<-which(updates$lineage == lineage_info$lineage[numbers[i]])
lineage_info$parent[numbers[i]]<-
paste(strsplit(updates$old[x], "\\.")[[1]][1], strsplit(updates$old[x], "\\.")[[1]][2],
strsplit(updates$old[x], "\\.")[[1]][3], sep = ".")
}
}
}
total_lineages<-read.csv("inst/extdata/References/RABV/lineage_info.csv")
"%notin%"<-Negate("%in%")
issues<-unique(all_lineage$parent[which(all_lineage$parent %notin% all_lineage$lineage)])
big_names<-c("RABV")
if (length(which(issues %in% big_names)) != 0 ) {
issues<-issues[-c(which(issues %in% big_names))]
}
while (length(issues) != 0) {
all_lineage<-rbind(all_lineage, total_lineages[which(total_lineages$lineage %in% issues),])
issues<-unique(all_lineage$parent[which(all_lineage$parent %notin% all_lineage$lineage)])
if (length(which(issues %in% big_names)) != 0 ) {
issues<-issues[-c(which(issues %in% big_names))]
}
}
for (i in 1:length(all_lineage$lineage)) {
all_lineage$n_seqs[i]<-length(which(new_seq$lineage == all_lineage$lineage[i]))
}
for (i in 1:length(lineage_info$lineage)) {
lineage_info$n_seqs[i]<-length(which(new_seq$lineage == lineage_info$lineage[i]))
lineage_info$year_first[i]<-min(new_seq$year[which(new_seq$lineage == lineage_info$lineage[i])])
lineage_info$year_last[i]<-max(new_seq$year[which(new_seq$lineage == lineage_info$lineage[i])])
}
lineage_info<-lineage_info[rev(order(lineage_info$lineage)),]
numbers<-which(lineage_info$n_seqs < 10 & lineage_info$lineage %notin% lineage_info$parent)
while (length(numbers) != 0) {
for (x in 1:length(numbers)) {
new_seq$lineage[which(new_seq$lineage == lineage_info$lineage[numbers[x]])]<-lineage_info$parent[numbers[x]]
countries<-unique(lineage_info$country[numbers[x]],
lineage_info$country[which(lineage_info$lineage == lineage_info$parent[numbers[x]])])
lineage_info$country[which(lineage_info$lineage == lineage_info$parent[numbers[x]])]<-countries
lineage_info<-lineage_info[-c(numbers),]
for (i in 1:length(lineage_info$lineage)) {
lineage_info$n_seqs[i]<-length(which(new_seq$lineage == lineage_info$lineage[i]))
lineage_info$year_first[i]<-min(new_seq$year[which(new_seq$lineage == lineage_info$lineage[i])])
lineage_info$year_last[i]<-max(new_seq$year[which(new_seq$lineage == lineage_info$lineage[i])])
}
}
numbers<-which(lineage_info$n_seqs < 10 & lineage_info$lineage %notin% lineage_info$parent)
}
write.csv(lineage_info, paste(args, "/Outputs/new_lineages.csv", sep = ""), row.names = F)
write.csv(all_lineage, paste(args, "/Outputs/relevant_lineages.csv", sep = ""), row.names = F)
write.csv(new_seq, paste(args, "/Outputs/sequence_data.csv", sep = ""), row.names = F)
total_lineages<-read.csv("inst/extdata/References/RABV/lineage_info.csv")
"%notin%"<-Negate("%in%")
issues<-unique(all_lineage$parent[which(all_lineage$parent %notin% all_lineage$lineage)])
big_names<-c("RABV")
if (length(which(issues %in% big_names)) != 0 ) {
issues<-issues[-c(which(issues %in% big_names))]
}
while (length(issues) != 0) {
all_lineage<-rbind(all_lineage, total_lineages[which(total_lineages$lineage %in% issues),])
issues<-unique(all_lineage$parent[which(all_lineage$parent %notin% all_lineage$lineage)])
if (length(which(issues %in% big_names)) != 0 ) {
issues<-issues[-c(which(issues %in% big_names))]
}
}
lineages<-data.frame(lineage = c(all_lineage$lineage, lineage_info$lineage),
parent = c(all_lineage$parent, lineage_info$parent),
n_seqs = c(all_lineage$n_seqs, lineage_info$n_seqs))
lineage_info<-lineages
if(length(which(duplicated(lineage_info))) != 0) {
lineage_info<-lineage_info[-c(which(duplicated(lineage_info))),]
}
lineage_info$colour<-NA
Colours<-c("Reds","Purples","YlOrBr","PuBuGn","YlOrRd","OrRd","PuBu","Pastel1","Greens","Greys",
"GnBu","BuGn","RdPu","Oranges","BuPu","YlGn","PuRd","YlGnBu")
lineages<-data.frame(lineage = lineage_info$lineage, subclade = NA)
for (i in 1:length(lineages$lineage)) {
lineages$subclade[i]<-strsplit(lineages$lineage[i], "_")[[1]][1]
}
letters <- c("A1", "B1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "J1", "K1", "L1", "M1", "N1",
"O1", "P1", "Q1", "R1", "S1", "T1", "U1", "V1", "W1", "X1", "Y1", "Z1")
if(length(grep("_", lineage_info$lineage)) != 0) {
if (length(which(lineages$subclade %in% letters)) != 0) {
lineages<-lineages[-c(which(lineages$subclade %in% letters)),]
}
}
clades<-unique(lineages$subclade)
if(length(grep("\\.", clades)) != 0 ) {
clades<-clades[-c(grep("\\.", clades))]
}
lineage<-lineage_info$lineage[-c(grep("_", lineage_info$lineage))]
cols<-RColorBrewer::brewer.pal(9, "Blues")
pal<-colorRampPalette(c(cols))
pal<-rev(pal(length(lineage)))
lineage_info$colour[-c(grep("_", lineage_info$lineage))]<-pal
for (i in 1:length(clades)) {
lineage<-grep(clades[i], lineage_info$lineage)
cols<-RColorBrewer::brewer.pal(3, Colours[i])
pal<-colorRampPalette(c(cols))
pal<-rev(pal(length(lineage)))
lineage_info$colour[(grep(clades[i], lineage_info$lineage))]<-pal
}
new<-plotly::plot_ly(
labels = c(lineage_info$lineage),
parents = c(lineage_info$parent),
values = c(lineage_info$n_seqs),
type = "sunburst",
marker = list(colors = (lineage_info$colour))
)
htmlwidgets::saveWidget(plotly::as_widget(new), (paste(args, "/Figures/", args, "_sunburst.html", sep = "")))
sequences<-sequences[which(sequences$ID %in% tree$tip.label),]
sequence_data<-data.frame(ID = c(sequences$ID, new_seq$ID), lineage = c(sequences$cluster, new_seq$lineage))
for (i in 1:length(sequence_data$ID)) {
if (sequence_data$ID[i] %in% assignments$ID) {
sequence_data$new[i]<-"Y"
} else {
sequence_data$new[i]<-"N"
}
}
lineage_info<-lineage_info[order(lineage_info$lineage),]
if (length(which(lineage_info$lineage %notin% sequence_data$lineage)) != 0){
lineage_info<-lineage_info[-c(which(lineage_info$lineage %notin% sequence_data$lineage)),]
}
add<-unique(sequence_data$lineage[which(sequence_data$lineage %notin% lineage_info$lineage)])
if (length(add) != 0) {
extras<-data.frame(lineage = add, parent = NA, n_seqs = NA, colour = NA)
for (i in 1:length(extras$lineage)) {
find<-strsplit(extras$lineage[i], "_")[[1]][1]
col_find<-lineage_info$colour[which(grepl(find, lineage_info$lineage))[1]]
sq<-i**2
sqsub<-substr(sq,1,1)
add_col<-paste(substr(col_find, 1, 4), sqsub, sqsub, sqsub, sep = "")
extras$colour[i]<-add_col
extras$colour[which(grepl("NA", extras$colour))]<-"#d6d4d4"
}
lineage_info<-rbind(lineage_info, extras)
}
lineage_info<-lineage_info[order(lineage_info$lineage),]
plot_tree<-ggtree::ggtree(tree, colour = "grey50", ladderize = T) %<+% sequence_data +
ggtree::geom_tippoint(colour = "grey50", size=4) +
ggtree::geom_tippoint(ggplot2::aes(color=lineage), size=3) +
ggtree::theme(plot.title = ggplot2::element_text(size = 40, face = "bold"))+
ggtree::scale_color_manual(values=c(lineage_info$colour)) +
ggtree::theme(legend.position = "none")
genotype<-data.frame(lineage = sequence_data$lineage)
rownames(genotype)<-sequence_data$ID
plot_tree<-ggtree::gheatmap(plot_tree, genotype, offset=0.01, width=.1, font.size=3, color = NA,
colnames_angle=-45, hjust=0) +
ggtree::scale_fill_manual(values=c(lineage_info$colour), name="lineage")+
ggtree::theme(legend.position = "none")
plot_new<-ggtree::ggtree(tree, colour = "grey50", ladderize = T) %<+% sequence_data +
ggtree::geom_tippoint(ggplot2::aes(color=new), size=5) +
scale_color_manual(values = c("#808080", "red"))
ggplot2::ggsave(paste(args, "/Figures/", args, "_lineage_tree.png", sep = ""),
plot = gridExtra::arrangeGrob(plot_tree, plot_new, ncol = 2))
} else {
print("No new lineages. Relevent existing lineage information in Outputs/relevant_lineages.csv with individual sequence assignments in assignment file.")
all_lineage<-read.csv("inst/extdata/References/RABV/lineage_info.csv")
all_lineage<-all_lineage[which(all_lineage$lineage %in% assignments$lineage),]
write.csv(all_lineage, paste(args, "/Outputs/relevant_lineages.csv", sep = ""), row.names = F)
all_lineage<-read.csv("inst/extdata/References/RABV/lineage_info.csv")
all_lineage<-all_lineage[which(all_lineage$lineage %in% sequences$cluster),]
for (i in 1:length(all_lineage$lineage)) {
all_lineage$n_seqs[i]<-length(which(assignments$lineage == all_lineage$lineage[i]))
}
node_data<-current
sequence_data<-assignments
lineage_info<-all_lineage
lineage_info$colour<-NA
Colours<-c("Reds","Purples","YlOrBr","PuBuGn","YlOrRd","OrRd","PuBu","Pastel1","Greens","Greys",
"GnBu","BuGn","RdPu","Oranges","BuPu","YlGn","PuRd","YlGnBu")
lineages<-data.frame(lineage = lineage_info$lineage, subclade = NA)
for (i in 1:length(lineages$lineage)) {
lineages$subclade[i]<-strsplit(lineages$lineage[i], "_")[[1]][1]
}
letters <- c("A1", "B1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "J1", "K1", "L1", "M1", "N1",
"O1", "P1", "Q1", "R1", "S1", "T1", "U1", "V1", "W1", "X1", "Y1", "Z1")
if(length(grep("_", lineage_info$lineage)) != 0) {
if (length(which(lineages$subclade %in% letters)) != 0) {
lineages<-lineages[-c(which(lineages$subclade %in% letters)),]
}
}
clades<-unique(lineages$subclade)
if(length(grep("\\.", clades)) != 0 ) {
clades<-clades[-c(grep("\\.", clades))]
}
lineage<-lineage_info$lineage[-c(grep("_", lineage_info$lineage))]
cols<-RColorBrewer::brewer.pal(9, "Blues")
pal<-colorRampPalette(c(cols))
pal<-rev(pal(length(lineage)))
lineage_info$colour[-c(grep("_", lineage_info$lineage))]<-pal
for (i in 1:length(clades)) {
lineage<-grep(clades[i], lineage_info$lineage)
cols<-RColorBrewer::brewer.pal(3, Colours[i])
pal<-colorRampPalette(c(cols))
pal<-rev(pal(length(lineage)))
lineage_info$colour[(grep(clades[i], lineage_info$lineage))]<-pal
}
new<-plotly::plot_ly(
labels = c(lineage_info$lineage),
parents = c(lineage_info$parent),
values = c(lineage_info$n_seqs),
type = "sunburst",
marker = list(colors = (lineage_info$colour))
)
htmlwidgets::saveWidget(plotly::as_widget(new), (paste(args, "/Figures/", args, "_sunburst.html", sep = "")))
sequences<-sequences[which(sequences$ID %in% tree$tip.label),]
sequence_data<-data.frame(ID = c(sequences$ID, assignments$ID), lineage = c(sequences$cluster, assignments$lineage))
for (i in 1:length(sequence_data$ID)) {
if (sequence_data$ID[i] %in% assignments$ID) {
sequence_data$new[i]<-"Y"
} else {
sequence_data$new[i]<-"N"
}
}
lineage_info<-lineage_info[order(lineage_info$lineage),]
plot_tree<-ggtree::ggtree(tree, colour = "grey50", ladderize = T) %<+% sequence_data +
ggtree::geom_tippoint(colour = "grey50", size=4) +
ggtree::geom_tippoint(ggplot2::aes(color=lineage), size=3) +
ggtree::theme(plot.title = ggplot2::element_text(size = 40, face = "bold"))+
ggtree::scale_color_manual(values=c(lineage_info$colour)) +
ggtree::theme(legend.position = "none")
genotype<-data.frame(lineage = sequence_data$lineage)
rownames(genotype)<-sequence_data$ID
plot_tree<-ggtree::gheatmap(plot_tree, genotype, offset=0.01, width=.1, font.size=3, color = NA,
colnames_angle=-45, hjust=0) +
ggtree::scale_fill_manual(values=c(lineage_info$colour), name="lineage")+
ggtree::theme(legend.position = "none")
plot_new<-ggtree::ggtree(tree, colour = "grey50", ladderize = T) %<+% sequence_data +
ggtree::geom_tippoint(ggplot2::aes(color=new), size=5) +
scale_color_manual(values = c("#808080", "red"))
ggplot2::ggsave(paste(args, "/Figures/", args, "_lineage_tree.png", sep = ""),
plot = gridExtra::arrangeGrob(plot_tree, plot_new, ncol = 2))
}
#############################################
# BOOTSTRAP SUPPORT #
#############################################
alignment_matrix <- seqinr::as.matrix.alignment(alignment)
ancestral_matrix <- seqinr::as.matrix.alignment(ancestral)
tree$tip.label <- gsub("\\..*", "", tree$tip.label, perl = T)
tree$node.comment<- gsub(".*=", "", tree$node.label, perl = T)
nodes_70 <- which(tree$node.comment > 70 | tree$node.comment == 100)
nodes_70 <- nodes_70 + length(tree$tip.label)
node_data <- data.frame(Node = nodes_70, n_tips = NA)
# Make a dataframe ready for values to be put in
# Fill the first column with the numbers of the nodes identified in the previous steps
for(i in 1:length(nodes_70)) {
node_data[i,2] <- length(phangorn::Descendants(tree, nodes_70[i], type = "tips")[[1]])
}
# For each node identified in the previous step, count the number of tips descended from that node
nodes_5 <- node_data[(which(node_data[,2]%in%5:9)),]
#############################################
# PARENT LINEAGES #
#############################################
for (i in 1:length(all_lineage$lineage)) {
match<-which(lineage_info$lineage == all_lineage$lineage[i])
lineage_info$n_seqs[match]<-lineage_info$n_seqs[match] + all_lineage$n_seqs[i]
}
tests<-data.frame(lineage = lineage_info$lineage[which(lineage_info$n_seqs >= 5)], clades = NA)
sequence_data<-data.frame(ID = c(seq_data$ID, sequence_data$ID), lineage = c(seq_data$lineage, sequence_data$lineage))
sequence_data<-sequence_data[-c(which(duplicated(sequence_data$ID))),]
if(length(which(sequence_data$lineage %in% 1:1000)) != 0) {
sequence_data<-data.frame(ID = c(sequences$ID, assignments$ID),
lineage = c(sequences$cluster, assignments$lineage))
}
for (x in 1:length(tests$lineage)) {
MRCA<-getMRCA(tree, tip = (sequence_data$ID[which(sequence_data$lineage == tests$lineage[x])]))
if (length(MRCA)!= 0) {
descendents<-getDescendants(tree, node=MRCA)
nodes<-which(descendents %in% nodes_5$Node)
if(length(nodes) != 0){
for (i in 1:length(nodes)) {
clades<-unique(
sequence_data$lineage[
which(sequence_data$ID %in% tree$tip.label[getDescendants(tree, node=descendents[nodes[i]])])])
if (length(clades)==1) {
if (clades == tests$lineage[x]){
tests$clades[x]<-paste(tests$clades[x], descendents[nodes[i]], sep = ",")
}
}
}
}
}
}
#' Which of the potential lineage defining nodes are descended from each lineage
if(length(which(is.na(tests$clades))) != 0){
tests<-tests[-c(which(is.na(tests$clades))),]
}
#' Remove any lineages with no potential lineage nodes descended
tests$clades<-gsub("NA,", "", tests$clades)
tests$n_clades<-NA
for (i in 1:length(tests$lineage)) {
tests$n_clades[i]<-length((strsplit(tests$clades, ","))[[i]])
}
#' Count number of potential lineage nodes for each lineage
#' Make a table of each of the nodes to investigate and which lineage they're from:
noi<-tests$clades[1]
for (i in 2:length(tests$lineage)) {
noi<-paste(noi, tests$clades[i], sep = ",")
}
noi<-strsplit(noi, ",")[[1]]
noi<-data.frame(node = noi, lineage = NA, tips = NA)
if(length(grep("NA", noi$node)) != 0) {
noi<-noi[-c(grep("NA", noi$node)),]
}
for (i in 1:length(noi$node)) {
noi$lineage[i]<-tests$lineage[grep(noi$node[i], tests$clades)]
}
noi$node<-as.integer(noi$node)
for (i in 1:length(noi$node)) {
noi$tips[i]<-length(Descendants(tree, noi$node[i], "tips")[[1]])
}
#' Count the number of tips for each node of interest
noi$tips<-as.integer(noi$tips)
#' Count the number of nodes of interest for each lineage:
int<-data.frame(lineage = unique(noi$lineage), count = NA)
for (i in 1:length(int$lineage)) {
int$count[i]<-length(which(noi$lineage == int$lineage[i]))
}
int$test<-NA
numbers<-which(int$count == 1)
if (length(numbers) != 0){
for (i in 1:length(numbers)) {
if ((lineage_info$n_seqs[which(lineage_info$lineage == int$lineage[numbers[i]])] -
noi$tips[which(noi$lineage == int$lineage[numbers[i]])]) == 0) {
int$test[numbers[i]]<-"N"
}
}
}
#' Check that this isn't just a ladder structure from the parent lineage
#############################################
# 95% COVERAGE WGS #
#############################################
noi$diff <- NA
nodes_reduced <- data.frame(Nodes = (noi$node - (1+length(tree$tip.label))))
#############################################
# DIFFERENCE FROM ANCESTOR #
#############################################
# For each node of interest, find all the tips
# Make a note of the differences between the oldest seq in the each cluster/lineage and one of the seqs in the lineage
# Which differences between the old seq and each seq are shared between all the seqs in the lineage
# E.g. which lineages show one or more shared nucleotides differences from the ancestor
# Count these differences and add them to the table to be analysed further (may just be n's)
for (i in 1:length(noi$node)) {
cm <- caper::clade.members(noi$node[i], tree, include.nodes = F, tip.labels = T)
seq_cm <- which(sequence_data$ID %in% cm)
old <- which(row.names(ancestral_matrix) == paste("NODE_", (sprintf("%07d", nodes_reduced$Nodes[i])), sep=""))
tips <- which(row.names(ancestral_matrix) %in% cm)
x <- which(ancestral_matrix[old,] != ancestral_matrix[(tips[1]),])
for (j in tips[-c(1)]) {
x <- x[which(x %in% (which(ancestral_matrix[old,] != ancestral_matrix[j,])))]
print(x)
noi$diff[i] <- length(x)
}
}
nodes_diff <- noi[(which(noi$diff!=0)),] # Get rid of the ones with no differences straight away
#############################################
# OVERLAPPING TIPS REMOVAL #
#############################################
# Add a column to nodes_diff and for each node, count how many of the other nodes of interest are descended from it
nodes_diff$overlaps <- NA
for (i in 1:length(nodes_diff$node)) {
nodes_diff$overlaps[i] <- length(which((phangorn::allDescendants(tree)[[(nodes_diff[i,1])]]) %in% nodes_diff[,1]))
}
nodes_diff<-nodes_diff[(which(nodes_diff$overlaps == 0)),]
#############################################
# SIGNIFICANT DIVERSITY #
#############################################
nodes_diff$distance<-NA
for (i in 1:length(nodes_diff$node)) {
parent<-Ancestors(tree, nodes_diff$node[i], "parent")
nodes_diff$distance[i]<-castor::get_pairwise_distances(tree, nodes_diff$node[1], parent)
}
#' Calculate the patristic distance of each node of interest from it's parent node
distances<-as.matrix(adephylo::distTips(tree, tips = "all", method = "patristic"))
#' Get the patristic distance matrix for all sequences
nodes_diff$top<-NA
for (i in 1:length(nodes_diff$node)) {
subset<-distances[which(rownames(distances)
%in% sequence_data$ID[which(sequence_data$lineage == nodes_diff$lineage[i])]),
which(colnames(distances)
%in% sequence_data$ID[which(sequence_data$lineage == nodes_diff$lineage[i])])]
nodes_diff$top[i]<-quantile(subset, 0.95)
}
#' Calculate the 95th percentile for all patristic distances in each lineage
nodes_diff<-nodes_diff[which(nodes_diff$distance >= nodes_diff$top),]
#' Only keep nodes where the distance to the parent node is at least the 95th percentile
if(length(nodes_diff$node) != 0) {
sequence_data$Country<-NA
for (i in 1:length(sequence_data$ID)) {
sequence_data$Country[i]<-metadata$country[which(metadata$ID == sequence_data$ID[i])]
}
sequence_data$Year<-NA
for (i in 1:length(sequence_data$ID)) {
sequence_data$Year[i]<-metadata$year[which(metadata$ID == sequence_data$ID[i])]
}
if(length(which(is.na(sequence_data$Year)))!= 0){
sequence_data<-sequence_data[-c(which(is.na(sequence_data$Year))),]
}
#############################################
# APPROPRIATE METADATA #
#############################################
sequence_data$Country<-gsub("-", NA, sequence_data$Country)
nodes_diff$country<-NA
#' Identify the countries the sequences descended from the nodes of interest are from
for (i in 1:length(nodes_diff$node)) {
countries<-unique(sequence_data$Country[
which(sequence_data$ID %in% tree$tip.label[Descendants(tree, nodes_diff$node[i], "tips")[[1]]])])
if(length(which(is.na(countries))) != 0) {
countries<-countries[-c(which(is.na(countries)))]
}
if (length(countries) != 1){
nodes_diff$country[i]<-list(c(countries))
} else {
nodes_diff$country[i]<-countries
}
}
nodes_diff$year_first<-NA
nodes_diff$year_last<-NA
#' Identify the first and most recent years of detection for the potential lineages
for (i in 1:length(nodes_diff$node)) {
nodes_diff$year_first[i]<-min(sequence_data$Year[
which(sequence_data$ID %in% tree$tip.label[Descendants(tree, nodes_diff$node[i], "tips")[[1]]])])
nodes_diff$year_last[i]<-max(sequence_data$Year[
which(sequence_data$ID %in% tree$tip.label[Descendants(tree, nodes_diff$node[i], "tips")[[1]]])])
}
nodes_diff$year_last<-as.integer(nodes_diff$year_last)
nodes_diff$year_first<-as.integer(nodes_diff$year_first)
#' Only carry forwards potential lineages where all sequences are from a 5 year or less time period
nodes_diff<-nodes_diff[which((nodes_diff$year_last - nodes_diff$year_first) < 5),]
if (length(nodes_diff$node) != 0) {
#############################################
# NAMING #
#############################################
emerging_lineages<-nodes_diff[c(1:3, 6, 8:10)]
#' Extract the useful information about the emerging/undersampled lineages
#' Add an Ex suffix to denote an emerging/undersampled lineage, where x indicates multiple emerging/undersampled
#' lineages descended from the same parent lineage
emerging_lineages$lineage<-paste(emerging_lineages$lineage, "E1", sep = "_")
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E1", "_E2", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E2", "_E3", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E3", "_E4", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E4", "_E5", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E5", "_E6", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E6", "_E7", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E7", "_E8", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E8", "_E9", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E9", "_E10", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E10", "_E11", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E11", "_E12", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E12", "_E13", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E13", "_E14", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E14", "_E15", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E15", "_E16", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E16", "_E17", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages$lineage[
which(duplicated(emerging_lineages$lineage))]<-
gsub("_E17", "_E18", emerging_lineages$lineage[which(duplicated(emerging_lineages$lineage))])
emerging_lineages<-emerging_lineages[,2:7]
emerging_lineages$country<-paste(emerging_lineages$country)
write.csv(emerging_lineages, paste(args, "/Outputs/emerging_undersampled.csv", sep = ""), row.names = F)
print("Emerging or undersampled lineages written to Outputs.")
} else {
print("No emerging or undersampled lineages detected in relevant lineages.")
}
} else {
print("No emerging or undersampled lineages detected in relevant lineages.")
}
#'**EMERGING/UNDERSAMPLED LINEAGES**
#############################################
# LONG BRANCHES #
#############################################
lengths<-data.frame(setNames(tree$edge.length[sapply(1:length(tree$tip.label),
function(x,y) which (y==x),y=tree$edge[,2])],
tree$tip.label))
#' Identify all the branch lengths from each tip to the nearest node
colnames(lengths)<-"lengths"
seqs<-rownames(lengths)[which(lengths$lengths >= quantile(lengths$lengths, .95))]
#' Identify the longest 5% of branch lengths and which tips these correspond to
seqs<-seqs[which(seqs %in% alignment$nam)]
seqs<-seqs[which(seqs %in% sequence_data$ID)]
lengths<-data.frame(ID = rownames(lengths), length = lengths$lengths)
lengths<-lengths[which(lengths$ID %in% seqs),]
lengths$close<-NA
lengths$lineage<-NA
lengths$cutoff<-NA
for (i in 1:length(lengths$ID)) {
lengths$lineage[i]<-sequence_data$lineage[which(sequence_data$ID == lengths$ID[i])]
}
#' Identify which lineage each of the diverse sequences belongs to
#############################################
# SIGNIFICANT DIVERSITY #
#############################################
for (i in 1:length(lengths$ID)) {
sequences<-sequence_data$ID[which(sequence_data$lineage == lengths$lineage[i])]
subset<-distances[which(rownames(distances)%in% sequences),
which(colnames(distances)%in% sequences)]
lengths$cutoff[i]<-quantile(subset, 0.95)
}
#' Identify the 95th percentile for patristic distances in each lineage
#' Identify the closest relative sequence. If the patristic distance between the query sequence and its
#' closest relative is at elast the 95th percentile of the lineage, list the relative. If not, 'NA'
for (i in c(1:length(lengths$ID))) {
if (which(alignment$nam == lengths$ID[i]) == length(alignment$nam)){
up<-NA
}else{
up<-alignment$nam[(which(alignment$nam == lengths$ID[i]))+1][[1]]
}
if (which(alignment$nam == lengths$ID[i]) == 1) {
down<-NA
}else{
down<-alignment$nam[(which(alignment$nam == lengths$ID[i]))-1][[1]]
}
test<-which(tree$tip.label == lengths$ID[i])
if (is.na(up)) {
up<-0
} else {
up<-which(tree$tip.label == up)
up<-castor::get_pairwise_distances(tree, test, up)
}
if (is.na(down)) {
down<-0
} else {
down<-which(tree$tip.label == down)
down<-castor::get_pairwise_distances(tree, test, down)
}
if(down > up && down >= lengths$cutoff[i]) {
lengths$close[i]<-alignment$nam[(which(alignment$nam == lengths$ID[i]))-1]
} else {
if (up >= lengths$cutoff[i]) {
lengths$close[i]<-alignment$nam[(which(alignment$nam == lengths$ID[i]))+1]
} else {
lengths$close[i] <- NA
}
}
}
if(length(which(is.na(lengths$close))) != 0) {
lengths<-lengths[-c(which(is.na(lengths$close))),]
}
#' remove any with NA signifying close relatives
#############################################
# OUTPUT #
#############################################
#' Make a table of information about the number of diverse singletons in each lineage, and where/when they
#' are from.
SOI<-data.frame(lineage = lineage_info$lineage, n_singletons = NA, singleton_countries = NA, singleton_years = NA)
SOI<-SOI[which(SOI$lineage %in% lengths$lineage),]
for (i in 1:length(SOI$lineage)) {
SOI$n_singletons[i]<-length(which(lengths$lineage == SOI$lineage[i]))
}
for (i in 1:length(lengths$lineage)) {
lengths$year[i]<-metadata$year[which(metadata$ID == lengths$ID[i])]
lengths$country[i]<-metadata$country[which(metadata$ID == lengths$ID[i])]
}
for (i in 1:length(SOI$lineage)) {
SOI$singleton_countries[i]<-list(unique(lengths$country[which(lengths$lineage == SOI$lineage[i])]))
SOI$singleton_years[i]<-list(unique(lengths$year[which(lengths$lineage == SOI$lineage[i])]))
}
SOI$singleton_countries<-paste(SOI$singleton_countries)
SOI$singleton_years<-paste(SOI$singleton_years)
print("Singletons of interest written to Outputs.")
write.csv(SOI, paste(args, "/Outputs/singletons_of_interest.csv", sep = ""), row.names = F)
write.csv(lengths, paste(args, "/Outputs/singleton_details.csv", sep = ""), row.names = F)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.