In KathrynCampbell/MADDOG: Lineage Designation and Assignment

knitr::opts_chunk$set(echo = TRUE)

args = commandArgs(trailingOnly = T)

library(dplyr)
library(ggplot2)
library(knitr)

metadata<-read.csv(paste("./", args, "/", args, "_metadata.csv", sep = ""))

if(file.exists(paste("./", args, "/Outputs/sequence_data.csv", sep = "")) == T) {
  sequence_data<-read.csv(paste("./", args, "/Outputs/sequence_data.csv", sep = ""))
} else {
  sequence_data<-read.csv(paste("./", args, "/", args, "_assignment.csv", sep = ""))
  sequence_data$year<-NA
  for (i in 1:length(sequence_data$ID)) {
    sequence_data$year[i]<-metadata$year[which(metadata$ID == sequence_data$ID[i])]
  }
}


relevant_lineages<-read.csv(paste("./", args, "/Outputs/relevant_lineages.csv", sep = ""))

if(file.exists(paste("./", args, "/Outputs/new_lineages.csv", sep = "")) == T) {
  new_lineages<-read.csv(paste("./", args, "/Outputs/new_lineages.csv", sep = ""))
} else {
  new_lineages<-NA
}

if(file.exists(paste("./", args, "/Outputs/emerging_undersampled.csv", sep = "")) == T) {
  emerging_undersampled<-read.csv(paste("./", args, "/Outputs/emerging_undersampled.csv", sep = ""))
} else {
  emerging_undersampled<-NA
}

if(file.exists(paste("./",args, "/Outputs/singletons_of_interest.csv", sep = "")) == T) {
  soi<-read.csv(paste("./",args, "/Outputs/singletons_of_interest.csv", sep = ""))
} else {
  soi<-NA
}

if(file.exists(paste("./",args, "/Outputs/singleton_details.csv", sep = "")) == T) {
  soi_details<-read.csv(paste("./",args, "/Outputs/singleton_details.csv", sep = ""))
} else {
  soi_details<-NA
}

```{css, echo=FALSE} h1, h2{ text-align: center; font-family: Helvetica; font-weight: bold; }

body{ margin-top: 75px; font-family: Helvetica; font-weight: lighter; font-size: 14pt; }

$~$

## Study Overview

$~$
$~$

```r
cat(paste("This study contains ", length(metadata$ID), " sequences from between ", min(metadata$year), " and ", max(metadata$year), ".", sep = ""))

cat(paste("This sequence data came from", length(unique(metadata$country)), "different countries. "))

regions<-data.frame(region=unique(metadata$country), count=NA)

for (i in 1:length(regions$region)) {
  regions$count[i]<-length(which(metadata$country == regions$region[i]))
}

names(regions)<-c("Country", "Number of Sequences")

kable(regions, row.names=F)

Table 1. Numbers of sequences by area.

$~$ $~$

\newpage

Lineages Overview

$~$ $~$

Several well-defined RABV clades circulate globally, within two major phylogenetic groups; bat-related and dog-related. The dog-related group is split into 6 different clades according to Troupin et al. (2016). These clades are: Africa 2, Africa 3, Cosmopolitan, Arctic, Asian and Indian. The majority of Nigerian sequences fall within the Africa 2 clade.

Clades Figure 1. Global rabies clades. Phylogeny of all global rabies clades as defined by Troupin et al. (2016). taken from https://doi.org/10.1371/journal.ppat.1006041

$~$

The MAD DOG (Method for Assignment, Definition and Designation Of Global lineages) tool is an updated lineage designation and assignment tool for rabies virus based on the dynamic nomenclature used for SARS-CoV-2 by Rambaut et al. (2020). This tool defines sequences beyond the clade and subclade level, allowing increased definition. Application of this tool can be used to generate detailed information to inform control efforts and monitor progress towards the elimination of rabies virus.

$~$

Details of the tool can be found at https://doi.org/10.5281/zenodo.5503916

$~$

For more information see: https://doi.org/10.1101/2021.10.13.464180

$~$ $~$ \newpage

if(file.exists(paste("./", args, "/Outputs/new_lineages.csv", sep = "")) == T) {
  cat(paste("A total of", length(unique(sequence_data$lineage)), "lineages have been detected in this study.", (length(relevant_lineages$lineage)+length(new_lineages$lineage)) - length(unique(sequence_data$lineage)), "lineages are included here that have not been seen in this study, but are direct parents of lineages in this study, so are included for relevant evolutionary investigations."))

  cat(paste("There are ", length(relevant_lineages$lineage), " existing lineages relevant to this study."))
  kable(relevant_lineages, row.names=F)
} else {
  cat(paste("No new lineages were detected within this dataset."))
}

Table 2. Details of lineages relevant to this study. First and last years refer to the first and most recent years the lineage has been detected prior to this study.

if(file.exists(paste("./", args, "/Outputs/new_lineages.csv", sep = "")) == T) {
  if(length(new_lineages) != 0) {
    cat(paste("There are ", length(new_lineages$lineage), " new lineages identified in this dataset."))
    kable(new_lineages, row.names=F)
  }
} else {
  cat(paste("There are no new lineages identified in this dataset."))
}

Table 3. Details of new lineages identified in this study. First and last years refer to the first and most recent years the lineage has been detected.

if(file.exists(paste("./", args, "/Outputs/new_lineages.csv", sep = "")) == T) {
  if(length(new_lineages) != 0) {
  lineage_info<-rbind(new_lineages, relevant_lineages)
  }
} else {
  lineage_info<-relevant_lineages
}

lineage_info$colour<-NA

  Colours<-c("Reds","Purples","YlOrBr","PuBuGn","YlOrRd","OrRd","PuBu","Pastel1","Greens","Greys",
             "GnBu","BuGn","RdPu","Oranges","BuPu","YlGn","PuRd","YlGnBu")

  lineages<-data.frame(lineage = lineage_info$lineage, subclade = NA)

  for (i in 1:length(lineages$lineage)) {
    lineages$subclade[i]<-strsplit(lineages$lineage[i], "_")[[1]][1]
  }

  letters <- c("A1", "B1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "J1", "K1", "L1", "M1", "N1",
               "O1", "P1", "Q1", "R1", "S1", "T1", "U1", "V1", "W1", "X1", "Y1", "Z1")

  if(length(grep("_", lineage_info$lineage)) != 0) {
    if (length(which(lineages$subclade %in% letters)) != 0) {
      lineages<-lineages[-c(which(lineages$subclade %in% letters)),]
    }
  }

  clades<-unique(lineages$subclade)

  if(length(grep("\\.", clades)) != 0 ) {
    clades<-clades[-c(grep("\\.", clades))]
  }

  lineage<-lineage_info$lineage[-c(grep("_", lineage_info$lineage))]
  cols<-RColorBrewer::brewer.pal(9, "Blues")
  pal<-colorRampPalette(c(cols))
  pal<-rev(pal(length(lineage)))
  lineage_info$colour[-c(grep("_", lineage_info$lineage))]<-pal

  for (i in 1:length(clades)) {
    lineage<-grep(clades[i], lineage_info$lineage)
    cols<-RColorBrewer::brewer.pal(3, Colours[i])
    pal<-colorRampPalette(c(cols))
    pal<-rev(pal(length(lineage)))
    lineage_info$colour[(grep(clades[i], lineage_info$lineage))]<-pal
  }

  new<-plotly::plot_ly(
    labels = c(lineage_info$lineage),
    parents = c(lineage_info$parent),
    values = c(lineage_info$n_seqs),
    type = "sunburst",
    marker = list(colors = (lineage_info$colour))
  )

  new

Figure 2. Sunburst plot showing hierarchal relationships of lineages. Bar length corresponds to number of sequences. Plot is interactive. Hover over bars to see details, and click to zoom in on sections. \newpage

Lineage Changes Over Time

cat(paste("The sequences span ", as.integer(max(metadata$year)) - as.integer(min(metadata$year)), " years from ", min(metadata$year), " to ", max(metadata$year),". ", sep = ""))

getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

cat(paste("The year with the greatest number of sequences is ", getmode(sequence_data$year), " with ", length(which(sequence_data$year == getmode(sequence_data$year))), " sequences."))

cat(paste("The most prevalent lineage is ", getmode(sequence_data$lineage), " with ", length(which(sequence_data$lineage == getmode(sequence_data$lineage))), " sequences."))

data_sub<-sequence_data

graph<-ggplot(data_sub, aes(x = year, fill = lineage)) + 
  geom_bar() + theme(legend.position = "bottom") + theme(axis.text.x = element_text(size = 7)) + 
  theme(legend.position="none") +
  theme(axis.title.x=element_blank()) 
lin<-data.frame(table(data_sub$lineage))
names(lin)<-c("lineage", "count")
pie<-ggplot(lin, aes(x="", y=count, fill=lineage)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) +
  theme_void()+   theme(legend.position="right") + theme(legend.text = element_text(size=7)) +theme(legend.key.size = unit(0.45,"cm"))+theme(legend.title = element_blank())
gridExtra::grid.arrange(graph,pie)

$~$

Figure 4. Above: Number of sequences per year, with bars split by lineage. Below: Pie chart indicating proportions of lineages.

Areas to Investigate

if(file.exists(paste("./", args, "/Outputs/emerging_undersampled.csv", sep = "")) == T){
  cat(paste("There are ", length(emerging_undersampled$lineage), " potentially emerging or undersampled lineages within the lineages relevant to this study. This means that these lineages have between 5 and 9 sequences; not enough to be a full lineage, but are significantly diverse from their relatives. With more sequencing and more time, these are likely to become new lineages. Be aware these include ALL emerging or undersampled lineages within the relevant lineages in your dataset; not just your data. This is to show there may be gaps in the data."))
  kable(emerging_undersampled, row.names = F) 
} else {
  cat(paste("There are no potentially emerging or undersampled lineages in this dataset!"))
}

$~$ Table 5. Details of potentially emerging or undersampled lineages relevant to this study. First and last years refer to the first and most recent years the lineage has been detected.

if(file.exists(paste("./",args, "/Outputs/singletons_of_interest.csv", sep = "")) == T){
  cat(paste("There are ", sum(soi$n_singletons), " singletons of interest detected. These reflect highly divergent sequences that could indicate sequencing errors, or the start of new lineages, especially in undersampled areas. Be aware these include ALL singletons of interest within the relevant lineages in your dataset; not just your data. This is to show there may be general gaps in the data."))
  kable(soi, row.names = F) 
} else {
  cat(paste("There are no singletons of interest in this dataset!"))
}

$~$ Table 6. Summary of singletons of interest relevant to this study.

if(file.exists(paste("./",args, "/Outputs/singleton_details.csv", sep = "")) == T){
  soi_details<-soi_details[,-c(2,5)]
  names(soi_details)[2]<-("closest relative")
  kable(soi_details, row.names = F) 
} else {
  cat(paste("There are no singletons of interest in this dataset!"))
}