In serratus-bio/tantalus: R Data Exploration for Serratus

Setup ========================================

# Install Tantalus Dependencies
#install.packages("devtools")
library(devtools)

# Install roxygen2 for development
#install.packages("roxygen2")
library(roxygen2)
roxygenize()

Access SQL database

# Load packages
library("tantalus")
# library("dbplyr")
library("reshape2")
library("RPostgres")

drv <- DBI::dbDriver("PostgreSQL")
con <- DBI::dbConnect(drv,
                     user="public_reader",
                     password="serratus",
                     host="serratus-aurora-20210306.cluster-ro-ccz9y6yshbls.us-east-1.rds.amazonaws.com",
                     port=5432,
                     dbname="summary")

## Tables: AccessionSections, FamilySections, FastaSections, Runs

Retreive all Coronavirus Family entries

#get Family table, and Coronaviridae family
CoV <- readDfSQL(con,
                 table = "rfamily",
                 family = "Coronaviridae-1",
                 columns = c('sra_id', 'score', 'percent_identity', 'family_name'),
                 score = 24,
                 dataframe = T)

# Plot
#familyScorPctID(CoV, "Coronaviridae")
pctidInScoreIntervals(CoV, title="Coronaviridae",
                      nucleotide = F)

# Retrive all viral familiy-level hits above score 79 as df
score_threshold <- 79

Viro <- readDfSQL(con, "nfamily",
                 columns = c("sra_id","score", "percent_identity", "family_name"),
                 score = score_threshold,
                 dataframe = T)
colnames(Viro) <- tolower(colnames(Viro))

# Deplete AMR sets
Viro <- Viro[ !(Viro$family == "AMR"), ]

# Read taxid for viral famalies
vtax <- read.table( file = "data/virus_taxid.tsv", header = TRUE)
  levels(vtax$family_name)[27] = "Unknown"
  RNAtaxa <- 1:26 # RNA viruses in vtax

# Summarize Counts Globally
Viro2 <- Viro %>% group_by(family) %>% count(pctid)
Viro2 <- cast(Viro2, family~pctid, fill = 0)

# Clean-up table - RNA viruses only, tax order
Viro2 <- Viro2[ match(vtax$family_name[RNAtaxa], Viro2$family), ]

# Melt-down for plotting
Viro2 <- melt(Viro2)
Viro2 <- Viro2[ order(Viro2$family), ]
Viro2$family <- factor(Viro2$family, levels = rev(unique(vtax$family_name)))
Viro2$pctid  <- factor(Viro2$pctid)
#Viro2$pctid  <- factor(Viro2$pctid,  levels = rev(unique(Viro2$pctid)))

# Plot Family-summaries
pViro <- ggplot(Viro2, aes(family, value, fill = pctid)) +
  coord_flip() +
  geom_bar(stat = 'identity') +
  scale_fill_viridis(discrete = TRUE) + theme_bw()
pViro

# Plot Family-summaries
Viro3 <- Viro2[ (as.numeric(as.character(Viro2$pctid)) < 91), ]
pViro <- ggplot(Viro3, aes(family, value, fill = pctid)) +
  coord_flip() +
  geom_bar(stat = 'identity') +
  scale_fill_viridis(discrete = TRUE) + theme_bw()
pViro

# Under the lamp-post plot
ViroLP <- Viro %>% group_by(family) %>% count(pctid)
ViroLP <- cast(ViroLP, family~pctid, fill = 0)

# Clean-up table - RNA viruses only, tax order
ViroLP <- ViroLP[ match(vtax$family_name[RNAtaxa], ViroLP$family), ]

# Take Right-Sum for each column
Rcumsum <- function(X){
  return(rev(cumsum(rev(as.numeric(X)))))
}

# Take cumulative sum of values
nCol <- length(ViroLP[1,])

# Weird behaviour in apply/sapply. Use for loop
for (i in 1:nrow(ViroLP)){
  ViroLP[i , 2:nCol] <- Rcumsum( ViroLP[i , 2:nCol])
}


# Melt-down for plotting
ViroLP2 <- melt(ViroLP)
ViroLP2$family <- factor(ViroLP2$family, levels = rev(unique(vtax$family_name)))
ViroLP2 <- ViroLP2[ order(ViroLP2$family), ]

# X offset
ViroLP2$X <- as.numeric(factor(ViroLP2$family))*100

# Y offset
ViroLP2$Y  <- factor(vtax$order[match(ViroLP2$family, vtax$family_name)])
ViroLP2$Y2 <- as.numeric(ViroLP2$Y)*100

# Summaries
ViroLPS <- ViroLP2[ !duplicated(ViroLP2$family), ]

# Plot "Under the lamp-post"
pViro <- ggplot(ViroLP2, aes(X, Y, color = pctid, size = value)) +
  geom_point() +
  scale_size("value") + scale_size_area() +
  scale_color_viridis() + theme_bw()
pViro

#get Family table, and Coronaviridae family
CoV <- readDfSQL(con, "FamilySections", family = "Coronaviridae")

#convert database output into dataframe
CoV <- readDfSQL(con, "FamilySections", family = "Coronaviridae",
                 columns = c("Sra","Score", "PctId", "Family"),
                 score = 24,
                 dataframe = T)

colnames(CoV) <- tolower(colnames(CoV))

pctidInScoreIntervals(CoV, "Coronaviridae", title="Coronaviridae",
                      scale_log = F, bin_scores = F)

#working with facet_wrap
drv <- DBI::dbDriver("PostgreSQL")
con <- DBI::dbConnect(drv,
                      user="postgres",
                      password="serratus",
                      host="big-parse-db.ccz9y6yshbls.us-east-1.rds.amazonaws.com",
                      port=5432,
                      dbname="postgres")

vtax <- read.table( file = "data/virus_taxid.tsv", header = TRUE)
levels(vtax$family_name)[27] = "Unknown"
RNAtaxa <- 1:26 # RNA viruses in vtax
DNAtaxa <- 27:42 # DNA viruses

rna_families <- vtax$family_name[RNAtaxa]
dna_families <- vtax$family_name[DNAtaxa]

RNA_spec <- readDfSQL(con, "FamilySections", family = rna_families,
                 columns = c("Sra","Score", "PctId", "Family"),
                 score = 20,
                 dataframe = T)

DNA_spec <- readDfSQL(con, "FamilySections", family = dna_families,
                 columns = c("Sra","Score", "PctId", "Family"),
                 score = 59,
                 dataframe = T)

colnames(RNA_spec) <- tolower(colnames(RNA_spec))
colnames(DNA_spec) <- tolower(colnames(DNA_spec))

p <- pctidInScoreIntervals(RNA_spec, family_name = NULL, title="",
                      scale_log = F, bin_scores = F)

p <- p + facet_wrap(~ family, ncol=4, scales="free_y")

p

d <- pctidInScoreIntervals(DNA_spec, family_name = NULL, title="",
                      scale_log = F, bin_scores = F)
d <- d + facet_wrap(~ family, ncol=4, scales="free_y")

d

# Deconstruct SQL query for family-level summary plots
# for Serratus.io "Family" page

#get Family table, and Coronaviridae family
sIO <- readDfSQL(con, "FamilySections")

#convert database output into dataframe
sIO <- readDfSQL(con, "FamilySections",
                 columns = c("Sra","Score", "PctId", "Family"),
                 dataframe = T)

colnames(sIO) <- tolower(colnames(sIO))

# Temp holder in case of error
sIO_holder <- sIO

## Group data by score/pctid
sIO$score <- factor(sIO$score)
sIO       <- sIO %>% group_by(pctid, score, family) %>% summarize(n=n(), log10n = log10(n() + 1))
sIO       <- sIO[,c(3,1,2,4)]

# Order by Family / score / pctid
sIO       <- sIO[order(sIO$pctid, decreasing = T), ]
sIO       <- sIO[order(sIO$score, decreasing = T), ]
sIO       <- sIO[order(sIO$family), ]

# Write csv output
write.csv(sIO, file = 'SerratusIO_scoreID.csv', row.names = F, quote = F)

release_stats <- read.table(file = 'release2_stats.txt', header = T)

#> sum(release_stats$bases)
#[1] 5.620087e+15
#> sum(release_stats$size_MB)
#[1] 2663778088

# Correlation matrix between virus occurence
library(ggcorrplot)

# Convert Viro df to a binary matrix of virus family
Viro3 <- dcast(Viro, sra~family, fun.aggregate = function(x){as.integer(length(x) > 0)} )

# Calculate corrleation matrix
Vcorr <- round(cor(Viro3[ ,-1]), 1)
p.corr <- cor_pmat(Vcorr)

# Visualize correlation matrix
ggcorrplot(Vcorr, hc.order = TRUE,
            type = "upper", outline.col = "white")

# Download data directory - Meta-Meta-Virome
# This will download ~8.5K summary files for all viromes
if ( !dir.exists('data/200528_viro') ){
  getSummary(s3_path    = 's3://serratus-public/out/200528_viro/summary',
             local_path = 'data/200528_viro')
}

# Download SraRunInfo File - Meta-Meta-Virome
if ( !file.exists('sra/viro_SraRunInfo.csv') ){

  getSraRunInfo(runInfo = "viro_SraRunInfo.csv",
                s3_path = "s3://serratus-public/out/200528_viro/",
                local_path = "sra")
}

# Download Sequence Reference - cov3ma
if ( !file.exists('seq/cov3ma/cov3ma.Rdata') ){
  getReference( refName = "cov3ma",
                s3_path = "s3://lovelywater2/seq/",
                local_path = "seq")
}

# Parse Serratus .summary Files into Serratus Object
# from a directory of summary data
DATA_PATH='data/test/'
sumFiles <- paste0( DATA_PATH,
                    system( paste0("ls ", DATA_PATH), intern = T ))
TEST <- readSerratus(sumFiles = sumFiles)

# Parse SraRunInfo.csv file into a SraRunInfo Object
sra_path='sra/viro_SraRunInfo.csv'
RUNINFO <- readSraRunInfo(sra_path)

library("dplyr")
library("ggplot2")
load("data/viro.RData")

families <- unique(ZOO@family$family)
Coronaviridae  <- ZOO@family %>% filter(family %in% c("Coronaviridae"))
Coronaviridae <- Coronaviridae %>% filter(score >= 75)
Coronaviridae <- Coronaviridae %>% filter(pctid <= 92)

for (sra in Coronaviridae$sra){
  getBAM(sra, s3_path = 's3://serratus-public/out/200528_viro')
}


ggplot(data=Coronaviridae, aes(Coronaviridae$aln)) +
  geom_histogram( bins = 100) +
  theme_bw() + scale_x_log10() +
  labs(x = "Aligned Reads") +
  labs(title = "Coronaviridae family")

ggplot(data=Coronaviridae, aes(Coronaviridae$score)) +
  geom_histogram(fill="blue", bins = 100) +
  theme_bw() +
  labs(x = "Family score") +
  labs(title = "Coronaviridae family")

  p <- ggplot(data=Coronaviridae2, aes(x = score, y = pctid)) +
  geom_jitter(color="blue", alpha = 0.4) +
  theme_bw() +
  labs(x = "Family score", y = "Percentage identity") +
  labs(title = "Coronaviridae family")
ggExtra::ggMarginal(p, type = "histogram")

p <- ggplot(data=Coronaviridae3, aes(x = score, y = pctid)) +
  geom_jitter(color="blue", alpha = 0.4) +
  theme_bw() +
  labs(x = "Family score", y = "Percentage identity") +
  labs(title = "Coronaviridae family")
ggExtra::ggMarginal(p, type = "histogram")

p1 <- ggplot(data=Coronaviridae, aes(pctid)) +
  geom_histogram( bins = 30) +
  theme_bw() + xlim(c(75,100)) +
  labs(x = "Percentage Identity") +
  labs(title = "Coronaviridae - all")


p2 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 50), ], aes(pctid)) +
  geom_histogram( bins = 30) +
  theme_bw() + xlim(c(75,100)) +
  labs(x = "Percentage Identity") +
  labs(title = "Coronaviridae - 50+")


p3 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 60), ], aes(pctid)) +
  geom_histogram( bins = 30) +
  theme_bw() + xlim(c(75,100)) +
  labs(x = "Percentage Identity") +
  labs(title = "Coronaviridae - 60+")

p4 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 70), ], aes(pctid)) +
  geom_histogram( bins = 30) +
  theme_bw() + xlim(c(75,100)) +
  labs(x = "Percentage Identity") +
  labs(title = "Coronaviridae - 70+")

p5 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 80), ], aes(pctid)) +
  geom_histogram( bins = 30) +
  theme_bw() + xlim(c(75,100)) +
  labs(x = "Percentage Identity") +
  labs(title = "Coronaviridae - 80+")

p6 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 90), ], aes(pctid)) +
  geom_histogram( bins = 30) +
  theme_bw() + xlim(c(75,100)) +
  labs(x = "Percentage Identity") +
  labs(title = "Coronaviridae - 90+")

library(grid)
grid.newpage()
grid.draw(rbind(ggplotGrob(p1),
                ggplotGrob(p2),
                ggplotGrob(p3),
                ggplotGrob(p4),
                ggplotGrob(p5),
                ggplotGrob(p6),
                size = "last"))

library(fst)
library(tantalus)
library(stringr)

# Serratus Data Release 200612 -- Family Hits
DATA_PATH='data/200612_df/Acc/'
sumWheres <- paste0( DATA_PATH,
                     system( paste0("ls ", DATA_PATH), intern = T ))

# See SRA Queries
scRNA <- read.csv(file = 'data/scRNA_SraRunInfo.csv')
scRNA <- scRNA$Run
save(scRNA, file = "data/scRNA.RData")

#read all columns
COV      <- readDFs(sumWheres,
                    columns = c("sra","family","score","pctid","aln","cvg","top","topname"),
                    family = "Coronaviridae")

HDV     <- readDFs(sumWheres,
                   columns = c("sra","pctid","aln","cvg","acc","name"),
                   acc = "NC_001653.2")

HDV$no_read_bin <- str_count(HDV$cvg, "_")

RATG13  <- readDFs(sumWheres,
                   columns = c("sra","pctid","aln","cvg","acc","name"),
                   acc = "MN996532.1"   )
RATG13$no_read_bin <- str_count(RATG13$cvg, "_")


HDV     <- readDFs(sumWheres,
                   columns = c("sra","pctid","aln","cvg","acc","name"),
                   acc = "NC_001653.2")
HDV$no_read_bin <- str_count(HDV$cvg, "_")

SDV     <- readDFs(sumWheres,
                   columns = c("sra","pctid","aln","cvg","acc","name"),
                   acc = "NC_040729.1")
SDV$no_read_bin <- str_count(SDV$cvg, "_")

library(fst)
library(tantalus)
library(stringr)

# Serratus Data Release 200612 -- Family Hits
DATA_PATH='data/200612_df/F/'
sumWheres <- paste0( DATA_PATH,
                     system( paste0("ls ", DATA_PATH), intern = T ))

POX      <- readDFs(sumWheres,
                    columns = c("sra","family","score","pctid","aln","cvg","top","topname"),
                    family = "Poxviridae")

POX2 <- POX[ POX$score > 90, ]

POX3 <- POX[ POX$topname == 'Salmon gill poxvirus;', ]

save(POX, POX2, file = '200626_POX.Rdata')
write.csv(POX3, file ='200626_Salmon_POX_lhf.csv')

library(tantalus)
load("data/cov.fam.200612.Rdata")
load("data/scRNA.Rdata")

scCOV <- COV$sra %in% scRNA

# 417 248
COV <- COV[ !scCOV, ]
#
COV2 <- COV[ COV$score > 20, ]
#
COV2 <- COV2[ COV2$aln > 100, ]