# Install Tantalus Dependencies #install.packages("devtools") library(devtools) # Install roxygen2 for development #install.packages("roxygen2") library(roxygen2) roxygenize()
# Load packages library("tantalus") # library("dbplyr") library("reshape2") library("RPostgres") drv <- DBI::dbDriver("PostgreSQL") con <- DBI::dbConnect(drv, user="public_reader", password="serratus", host="serratus-aurora-20210306.cluster-ro-ccz9y6yshbls.us-east-1.rds.amazonaws.com", port=5432, dbname="summary") ## Tables: AccessionSections, FamilySections, FastaSections, Runs
#get Family table, and Coronaviridae family CoV <- readDfSQL(con, table = "rfamily", family = "Coronaviridae-1", columns = c('sra_id', 'score', 'percent_identity', 'family_name'), score = 24, dataframe = T) # Plot #familyScorPctID(CoV, "Coronaviridae") pctidInScoreIntervals(CoV, title="Coronaviridae", nucleotide = F)
# Retrive all viral familiy-level hits above score 79 as df score_threshold <- 79 Viro <- readDfSQL(con, "nfamily", columns = c("sra_id","score", "percent_identity", "family_name"), score = score_threshold, dataframe = T) colnames(Viro) <- tolower(colnames(Viro)) # Deplete AMR sets Viro <- Viro[ !(Viro$family == "AMR"), ] # Read taxid for viral famalies vtax <- read.table( file = "data/virus_taxid.tsv", header = TRUE) levels(vtax$family_name)[27] = "Unknown" RNAtaxa <- 1:26 # RNA viruses in vtax
# Summarize Counts Globally Viro2 <- Viro %>% group_by(family) %>% count(pctid) Viro2 <- cast(Viro2, family~pctid, fill = 0) # Clean-up table - RNA viruses only, tax order Viro2 <- Viro2[ match(vtax$family_name[RNAtaxa], Viro2$family), ] # Melt-down for plotting Viro2 <- melt(Viro2) Viro2 <- Viro2[ order(Viro2$family), ] Viro2$family <- factor(Viro2$family, levels = rev(unique(vtax$family_name))) Viro2$pctid <- factor(Viro2$pctid) #Viro2$pctid <- factor(Viro2$pctid, levels = rev(unique(Viro2$pctid))) # Plot Family-summaries pViro <- ggplot(Viro2, aes(family, value, fill = pctid)) + coord_flip() + geom_bar(stat = 'identity') + scale_fill_viridis(discrete = TRUE) + theme_bw() pViro # Plot Family-summaries Viro3 <- Viro2[ (as.numeric(as.character(Viro2$pctid)) < 91), ] pViro <- ggplot(Viro3, aes(family, value, fill = pctid)) + coord_flip() + geom_bar(stat = 'identity') + scale_fill_viridis(discrete = TRUE) + theme_bw() pViro
# Under the lamp-post plot ViroLP <- Viro %>% group_by(family) %>% count(pctid) ViroLP <- cast(ViroLP, family~pctid, fill = 0) # Clean-up table - RNA viruses only, tax order ViroLP <- ViroLP[ match(vtax$family_name[RNAtaxa], ViroLP$family), ] # Take Right-Sum for each column Rcumsum <- function(X){ return(rev(cumsum(rev(as.numeric(X))))) } # Take cumulative sum of values nCol <- length(ViroLP[1,]) # Weird behaviour in apply/sapply. Use for loop for (i in 1:nrow(ViroLP)){ ViroLP[i , 2:nCol] <- Rcumsum( ViroLP[i , 2:nCol]) } # Melt-down for plotting ViroLP2 <- melt(ViroLP) ViroLP2$family <- factor(ViroLP2$family, levels = rev(unique(vtax$family_name))) ViroLP2 <- ViroLP2[ order(ViroLP2$family), ] # X offset ViroLP2$X <- as.numeric(factor(ViroLP2$family))*100 # Y offset ViroLP2$Y <- factor(vtax$order[match(ViroLP2$family, vtax$family_name)]) ViroLP2$Y2 <- as.numeric(ViroLP2$Y)*100 # Summaries ViroLPS <- ViroLP2[ !duplicated(ViroLP2$family), ] # Plot "Under the lamp-post" pViro <- ggplot(ViroLP2, aes(X, Y, color = pctid, size = value)) + geom_point() + scale_size("value") + scale_size_area() + scale_color_viridis() + theme_bw() pViro
#get Family table, and Coronaviridae family CoV <- readDfSQL(con, "FamilySections", family = "Coronaviridae") #convert database output into dataframe CoV <- readDfSQL(con, "FamilySections", family = "Coronaviridae", columns = c("Sra","Score", "PctId", "Family"), score = 24, dataframe = T) colnames(CoV) <- tolower(colnames(CoV)) pctidInScoreIntervals(CoV, "Coronaviridae", title="Coronaviridae", scale_log = F, bin_scores = F)
#working with facet_wrap drv <- DBI::dbDriver("PostgreSQL") con <- DBI::dbConnect(drv, user="postgres", password="serratus", host="big-parse-db.ccz9y6yshbls.us-east-1.rds.amazonaws.com", port=5432, dbname="postgres") vtax <- read.table( file = "data/virus_taxid.tsv", header = TRUE) levels(vtax$family_name)[27] = "Unknown" RNAtaxa <- 1:26 # RNA viruses in vtax DNAtaxa <- 27:42 # DNA viruses rna_families <- vtax$family_name[RNAtaxa] dna_families <- vtax$family_name[DNAtaxa] RNA_spec <- readDfSQL(con, "FamilySections", family = rna_families, columns = c("Sra","Score", "PctId", "Family"), score = 20, dataframe = T) DNA_spec <- readDfSQL(con, "FamilySections", family = dna_families, columns = c("Sra","Score", "PctId", "Family"), score = 59, dataframe = T) colnames(RNA_spec) <- tolower(colnames(RNA_spec)) colnames(DNA_spec) <- tolower(colnames(DNA_spec)) p <- pctidInScoreIntervals(RNA_spec, family_name = NULL, title="", scale_log = F, bin_scores = F) p <- p + facet_wrap(~ family, ncol=4, scales="free_y") p
d <- pctidInScoreIntervals(DNA_spec, family_name = NULL, title="", scale_log = F, bin_scores = F) d <- d + facet_wrap(~ family, ncol=4, scales="free_y") d
# Deconstruct SQL query for family-level summary plots # for Serratus.io "Family" page #get Family table, and Coronaviridae family sIO <- readDfSQL(con, "FamilySections") #convert database output into dataframe sIO <- readDfSQL(con, "FamilySections", columns = c("Sra","Score", "PctId", "Family"), dataframe = T) colnames(sIO) <- tolower(colnames(sIO)) # Temp holder in case of error sIO_holder <- sIO ## Group data by score/pctid sIO$score <- factor(sIO$score) sIO <- sIO %>% group_by(pctid, score, family) %>% summarize(n=n(), log10n = log10(n() + 1)) sIO <- sIO[,c(3,1,2,4)] # Order by Family / score / pctid sIO <- sIO[order(sIO$pctid, decreasing = T), ] sIO <- sIO[order(sIO$score, decreasing = T), ] sIO <- sIO[order(sIO$family), ] # Write csv output write.csv(sIO, file = 'SerratusIO_scoreID.csv', row.names = F, quote = F)
release_stats <- read.table(file = 'release2_stats.txt', header = T) #> sum(release_stats$bases) #[1] 5.620087e+15 #> sum(release_stats$size_MB) #[1] 2663778088
# Correlation matrix between virus occurence library(ggcorrplot) # Convert Viro df to a binary matrix of virus family Viro3 <- dcast(Viro, sra~family, fun.aggregate = function(x){as.integer(length(x) > 0)} ) # Calculate corrleation matrix Vcorr <- round(cor(Viro3[ ,-1]), 1) p.corr <- cor_pmat(Vcorr) # Visualize correlation matrix ggcorrplot(Vcorr, hc.order = TRUE, type = "upper", outline.col = "white")
# Download data directory - Meta-Meta-Virome # This will download ~8.5K summary files for all viromes if ( !dir.exists('data/200528_viro') ){ getSummary(s3_path = 's3://serratus-public/out/200528_viro/summary', local_path = 'data/200528_viro') } # Download SraRunInfo File - Meta-Meta-Virome if ( !file.exists('sra/viro_SraRunInfo.csv') ){ getSraRunInfo(runInfo = "viro_SraRunInfo.csv", s3_path = "s3://serratus-public/out/200528_viro/", local_path = "sra") } # Download Sequence Reference - cov3ma if ( !file.exists('seq/cov3ma/cov3ma.Rdata') ){ getReference( refName = "cov3ma", s3_path = "s3://lovelywater2/seq/", local_path = "seq") }
# Parse Serratus .summary Files into Serratus Object # from a directory of summary data DATA_PATH='data/test/' sumFiles <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) TEST <- readSerratus(sumFiles = sumFiles) # Parse SraRunInfo.csv file into a SraRunInfo Object sra_path='sra/viro_SraRunInfo.csv' RUNINFO <- readSraRunInfo(sra_path)
library("dplyr") library("ggplot2") load("data/viro.RData") families <- unique(ZOO@family$family) Coronaviridae <- ZOO@family %>% filter(family %in% c("Coronaviridae")) Coronaviridae <- Coronaviridae %>% filter(score >= 75) Coronaviridae <- Coronaviridae %>% filter(pctid <= 92) for (sra in Coronaviridae$sra){ getBAM(sra, s3_path = 's3://serratus-public/out/200528_viro') } ggplot(data=Coronaviridae, aes(Coronaviridae$aln)) + geom_histogram( bins = 100) + theme_bw() + scale_x_log10() + labs(x = "Aligned Reads") + labs(title = "Coronaviridae family") ggplot(data=Coronaviridae, aes(Coronaviridae$score)) + geom_histogram(fill="blue", bins = 100) + theme_bw() + labs(x = "Family score") + labs(title = "Coronaviridae family") p <- ggplot(data=Coronaviridae2, aes(x = score, y = pctid)) + geom_jitter(color="blue", alpha = 0.4) + theme_bw() + labs(x = "Family score", y = "Percentage identity") + labs(title = "Coronaviridae family") ggExtra::ggMarginal(p, type = "histogram") p <- ggplot(data=Coronaviridae3, aes(x = score, y = pctid)) + geom_jitter(color="blue", alpha = 0.4) + theme_bw() + labs(x = "Family score", y = "Percentage identity") + labs(title = "Coronaviridae family") ggExtra::ggMarginal(p, type = "histogram")
p1 <- ggplot(data=Coronaviridae, aes(pctid)) + geom_histogram( bins = 30) + theme_bw() + xlim(c(75,100)) + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - all") p2 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 50), ], aes(pctid)) + geom_histogram( bins = 30) + theme_bw() + xlim(c(75,100)) + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 50+") p3 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 60), ], aes(pctid)) + geom_histogram( bins = 30) + theme_bw() + xlim(c(75,100)) + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 60+") p4 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 70), ], aes(pctid)) + geom_histogram( bins = 30) + theme_bw() + xlim(c(75,100)) + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 70+") p5 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 80), ], aes(pctid)) + geom_histogram( bins = 30) + theme_bw() + xlim(c(75,100)) + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 80+") p6 <- ggplot(data=Coronaviridae[which(Coronaviridae$score >= 90), ], aes(pctid)) + geom_histogram( bins = 30) + theme_bw() + xlim(c(75,100)) + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 90+") library(grid) grid.newpage() grid.draw(rbind(ggplotGrob(p1), ggplotGrob(p2), ggplotGrob(p3), ggplotGrob(p4), ggplotGrob(p5), ggplotGrob(p6), size = "last"))
library(fst) library(tantalus) library(stringr) # Serratus Data Release 200612 -- Family Hits DATA_PATH='data/200612_df/Acc/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) # See SRA Queries scRNA <- read.csv(file = 'data/scRNA_SraRunInfo.csv') scRNA <- scRNA$Run save(scRNA, file = "data/scRNA.RData") #read all columns COV <- readDFs(sumWheres, columns = c("sra","family","score","pctid","aln","cvg","top","topname"), family = "Coronaviridae") HDV <- readDFs(sumWheres, columns = c("sra","pctid","aln","cvg","acc","name"), acc = "NC_001653.2") HDV$no_read_bin <- str_count(HDV$cvg, "_") RATG13 <- readDFs(sumWheres, columns = c("sra","pctid","aln","cvg","acc","name"), acc = "MN996532.1" ) RATG13$no_read_bin <- str_count(RATG13$cvg, "_") HDV <- readDFs(sumWheres, columns = c("sra","pctid","aln","cvg","acc","name"), acc = "NC_001653.2") HDV$no_read_bin <- str_count(HDV$cvg, "_") SDV <- readDFs(sumWheres, columns = c("sra","pctid","aln","cvg","acc","name"), acc = "NC_040729.1") SDV$no_read_bin <- str_count(SDV$cvg, "_")
library(fst) library(tantalus) library(stringr) # Serratus Data Release 200612 -- Family Hits DATA_PATH='data/200612_df/F/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) POX <- readDFs(sumWheres, columns = c("sra","family","score","pctid","aln","cvg","top","topname"), family = "Poxviridae") POX2 <- POX[ POX$score > 90, ] POX3 <- POX[ POX$topname == 'Salmon gill poxvirus;', ] save(POX, POX2, file = '200626_POX.Rdata') write.csv(POX3, file ='200626_Salmon_POX_lhf.csv')
library(tantalus) load("data/cov.fam.200612.Rdata") load("data/scRNA.Rdata") scCOV <- COV$sra %in% scRNA # 417 248 COV <- COV[ !scCOV, ] # COV2 <- COV[ COV$score > 20, ] # COV2 <- COV2[ COV2$aln > 100, ]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.