#library("RSQLite") #library("jsonlite") library("tantalus") library("ggplot2") library("gridExtra") library("dplyr") #for exploring library("plotly") library("grid") library("scales") library("viridis") library("ggpubr") library("fst")
#DATA_PATH='../200528_viro/' #DATA_PATH='../200525_vert/' #DATA_PATH='../200530_hu1/' #DATA_PATH='../200606_mamm/' DATA_PATH='../200606_hu2/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) #ZOO <- readSerratus( sumWheres ) #for (f in sumWheres){ #print(f) #test <- readSerratus( f ) #} start.time <- Sys.time() #ZOO <- readSerratus( sumWheres ) HU2 <- readSerratus( sumWheres ) end.time <- Sys.time() time.taken <- end.time - start.time time.taken #save(ZOO, file="../VIRO.RData") #save(VERT, file="../VERT.RData") #save(HU1, file="../HU1.RData") #save(MAMM, file="../MAMM.RData") #save(HU2, file="../HU2.RData") #load("../VIRO.RData") #load("../MAMM.RData") #load("../HU2.RData") #magor_problem <- c("../200528_viro/ERR2002982.summary") #x <- readSerratus( magor_problem ) MAMM <- readRDS("../Atys/mamm_atys.rds") MAMM <- MAMM@Serratus
#familyScore(MAMM, "Coronaviridae", log=T)
#x <- familyScorPctID(MAMM, "Coronaviridae")
#p1 <- familyPctID(MAMM, "Coronaviridae", score_threshold = 0, title="Coronaviridae - all", x1=70, x2=100, log=T) #p2 <- familyPctID(MAMM, "Coronaviridae", score_threshold = 50, title="Coronaviridae - 50+", x1=70, x2=100, log=T) #p3 <- familyPctID(MAMM, "Coronaviridae", score_threshold = 60, title="Coronaviridae - 60+", x1=70, x2=100, log=T) #p4 <- familyPctID(MAMM, "Coronaviridae", score_threshold = 70, title="Coronaviridae - 70+", x1=70, x2=100, log=T) #p5 <- familyPctID(MAMM, "Coronaviridae", score_threshold = 80, title="Coronaviridae - 80+", x1=70, x2=100, log=T) #p6 <- familyPctID(MAMM, "Coronaviridae", score_threshold = 90, title="Coronaviridae - 90+", x1=70, x2=100, log=T) #grid.newpage() #grid.draw(rbind(ggplotGrob(p1), # ggplotGrob(p2), # ggplotGrob(p3), # ggplotGrob(p4), # ggplotGrob(p5), # ggplotGrob(p6), # size = "last"))
pctidInScoreIntervals(MAMM@family, "Coronaviridae", title="Coronaviridae")
scoreInPctidIntervals(MAMM@family, "Coronaviridae", title="Coronaviridae")
# Parse SraRunInfo.csv file into a SraRunInfo Object sra_path='../sra/viro_SraRunInfo.csv' RUNINFO <- readSraRunInfo(sra_path) RUNINFO_vert <- readSraRunInfo('../sra/vert_sraRunInfo.csv') RUNINFO_hu1 <- readSraRunInfo('../sra/hu1_SraRunInfo.csv') RUNINFO_hu2 <- readSraRunInfo('../sra/hu2_sraRunInfo.csv') RUNINFO_mamm <- readSraRunInfo('../sra/mamm_SraRunInfo.csv') load("../VIRO.RData") load("../VERT.RData") load("../HU1.RData") load("../HU2.RData") load("../MAMM.RData") atys_viro <- Atys(Serratus = ZOO, SRA = RUNINFO, GENOME="") atys_vert <- Atys(Serratus = VERT, SRA = RUNINFO_vert, GENOME="") atys_hu1 <- Atys(Serratus = HU1, SRA = RUNINFO_hu1, GENOME="") atys_hu2 <- Atys(Serratus = HU2, SRA = RUNINFO_hu2, GENOME="") atys_mamm <- Atys(Serratus = MAMM, SRA = RUNINFO_mamm, GENOME="") saveRDS(atys_viro, "../Atys/viro_atys.rds") saveRDS(atys_vert, "../Atys/vert_atys.rds") saveRDS(atys_hu1, "../Atys/hu1_atys.rds") saveRDS(atys_hu2, "../Atys/hu2_atys.rds") saveRDS(atys_mamm, "../Atys/mamm_atys.rds") DATA='../Atys/' parseAtys(DATA) #######################################################################
sras <- c("SRR9701190", "SRR9705127", "SRR975462", "SRR9843091", "SRR9843092") serr <- getSummaryFiles(sras, 's3://lovelywater2/summary/', "summary_files")
#TEST DATA_PATH='../summary_test/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) SRA <- as.character(sapply(sumWheres, readSra)) #Time difference of 0.01752162 secs HEADER <- data.frame(t(sapply(sumWheres, readHeader))) #Time difference of 1.364799 secs HITS_Family <- lapply(sumWheres, readHits, family=T) #Time difference of 15.47495 secs HITS_Family <- do.call(rbind.data.frame, HITS_Family) #Time difference of 0.4746368 secs HITS_ACC <- lapply(sumWheres, readHits, family=F) #Time difference of 1.758523 mins HITS_ACC <- do.call(rbind.data.frame, HITS_ACC) #Time difference of 1.099728 secs start.time <- Sys.time() HITS_ACC <- do.call(rbind.data.frame, HITS_ACC) end.time <- Sys.time() time.taken <- end.time - start.time time.taken
VIRO - 8530 files VERT - 94453 files MAMM - 94743 MU - 264281 HUMAN - 651614 HUMAN_META - 35709
vert_sra <- readSraRunInfo("../sra/vert_SraRunInfo.csv") vert_serratus <- getLoadedSummaryFiles(vert_sra@runInfo$Run, "../summary_files/") vert_sra@runInfo <- vert_sra@runInfo %>% filter(Run %in% vert_serratus@header$sra) vert_atys <- Atys(Serratus = vert_serratus, SRA = vert_sra, GENOME="") print("Save to rds") saveRDS(vert_atys, "../Atys/vert_atys.rds")
viro2_sra <- readSraRunInfo("../sra/viro2_SraRunInfo.csv") #hu_m_sra <- readSraRunInfo("../sra/hu_meta_SraRunInfo.csv") #x <- rbind(hu_sra@runInfo, hu_m_sra@runInfo) #x <- x[!duplicated(x$Run), ] #hu_sra@runInfo <- x #rm(hu_m_sra) #rm(x) sumWheres_all <- paste("../summary_viro/", system( paste0("ls ", "../summary_viro/"), intern = T), sep="/") summary_files <- paste(viro2_sra@runInfo$Run, "summary", sep=".") sumWheres <- paste("../summary_viro/", summary_files, sep="/") #remove files that are absent in the specified folder files_to_analyze <- intersect(sumWheres_all, sumWheres) parseSummaryFiles(files_to_analyze, outputFolder="../viro2/viro2_F/", parseFamily=T) parseSummaryFiles(files_to_analyze, outputFolder="../viro2/viro2_Acc/", parseFamily=F) #read all family dfs DATA_PATH='../viro2_dfs/viro2_F/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) #read subset of columns viro2 <- readDFs(sumWheres, all=F, columns = c("sra","score", "pctid", "family")) #plot fscore histogram familyScore(viro2, "Coronaviridae", log=T) #plot fscore/pctid histogram/scatterplot x <- familyScorPctID(viro2, "Coronaviridae") x #plot pctid histograms library("grid") p1 <- familyPctID(viro2, "Coronaviridae", score_threshold = 0, title="Coronaviridae - all", x1=70, x2=100, log=T) p2 <- familyPctID(viro2, "Coronaviridae", score_threshold = 50, title="Coronaviridae - 50+", x1=70, x2=100, log=T) p3 <- familyPctID(viro2, "Coronaviridae", score_threshold = 60, title="Coronaviridae - 60+", x1=70, x2=100, log=T) p4 <- familyPctID(viro2, "Coronaviridae", score_threshold = 70, title="Coronaviridae - 70+", x1=70, x2=100, log=T) p5 <- familyPctID(viro2, "Coronaviridae", score_threshold = 80, title="Coronaviridae - 80+", x1=70, x2=100, log=T) p6 <- familyPctID(viro2, "Coronaviridae", score_threshold = 90, title="Coronaviridae - 90+", x1=70, x2=100, log=T) grid.newpage() grid.draw(rbind(ggplotGrob(p1), ggplotGrob(p2), ggplotGrob(p3), ggplotGrob(p4), ggplotGrob(p5), ggplotGrob(p6), size = "last")) pctidInScoreIntervals(viro2, "Coronaviridae", title="Coronaviridae") scoreInPctidIntervals(viro2, "Coronaviridae", title="Coronaviridae")
############################################################################################### DATA_PATH='../vert_dfs/temp_folder_vert_F/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) start.time <- Sys.time() hits <- lapply(sumWheres, function(x){ df_subset <- read.fst(x, c("sra","score", "pctid", "family")) return(df_subset) }) suppressWarnings(y <- bind_rows(hits)) end.time <- Sys.time() time.taken <- end.time - start.time time.taken #familyScore(HU2, "Coronaviridae", log=T) family_df <- y %>% filter(family %in% c("Coronaviridae")) p <- ggplot(data=family_df, aes(family_df$score)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + scale_y_log10() + coord_cartesian(xlim=c(0,100)) + labs(x = "Family score") + labs(title = paste("Coronaviridae", "family (log10 y axis)", sep = " ")) #x <- familyScorPctID(HU2, "Coronaviridae") p <- ggplot(data=family_df, aes(x = score, y = pctid)) + geom_jitter(color="blue", alpha = 0.4) + theme_bw() + labs(x = "Family score", y = "Percentage identity") + labs(title = paste("Coronaviridae", "family", sep = " ")) s <- ggExtra::ggMarginal(p, type = "histogram") p1 <- ggplot(data=family_df[which(family_df$score >= 0), ], aes(pctid)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + coord_cartesian(xlim=c(70,100)) + scale_y_log10() + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - all") p2 <- ggplot(data=family_df[which(family_df$score >= 50), ], aes(pctid)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + coord_cartesian(xlim=c(70,100)) + scale_y_log10() + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 50+") p3 <- ggplot(data=family_df[which(family_df$score >= 60), ], aes(pctid)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + coord_cartesian(xlim=c(70,100)) + scale_y_log10() + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 60+") p4 <- ggplot(data=family_df[which(family_df$score >= 70), ], aes(pctid)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + coord_cartesian(xlim=c(70,100)) + scale_y_log10() + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 70+") p5 <- ggplot(data=family_df[which(family_df$score >= 80), ], aes(pctid)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + coord_cartesian(xlim=c(70,100)) + scale_y_log10() + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 80+") p6 <- ggplot(data=family_df[which(family_df$score >= 90), ], aes(pctid)) + geom_histogram(fill="blue", bins = 30) + theme_bw() + coord_cartesian(xlim=c(70,100)) + scale_y_log10() + labs(x = "Percentage Identity") + labs(title = "Coronaviridae - 90+") grid.newpage() grid.draw(rbind(ggplotGrob(p1), ggplotGrob(p2), ggplotGrob(p3), ggplotGrob(p4), ggplotGrob(p5), ggplotGrob(p6), size = "last"))
#pctidInScoreIntervals(HU2, "Coronaviridae", title="Coronaviridae in HU2") test <- family_df$score test2 <- sapply(test, function(x){ if ((x <= 100) & (x > 95)){ return("95-100") } else if ((x <= 95) & (x > 85)){ return("85-95") } else if ((x <= 85) & (x > 75)){ return("75-85") } else if ((x <= 75) & (x > 65)){ return("65-75") } else if ((x <= 65) & (x > 55)){ return("55-65") } else if ((x <= 55) & (x > 45)){ return("45-55") } else if ((x <= 45) & (x > 35)){ return("35-45") } else if ((x <= 35) & (x > 25)){ return("25-35") } else if ((x <= 25) & (x > 15)){ return("15-25") } else if ((x <= 15) & (x > 5)){ return("5-15") } else { return("0-5") } }) test2 <- factor(test2, levels = c("0-5", "5-15", "15-25", "25-35", "35-45", "45-55", "55-65", "65-75", "75-85", "85-95", "95-100")) family_df$score_intervals <- test2 p <- ggplot(family_df %>% group_by(pctid,score_intervals) %>% summarize(n=n(), log10n = log10(n() + 1)), aes(pctid, log10n,fill = score_intervals)) + geom_bar(color="black",stat = "identity") + scale_fill_viridis(discrete=TRUE) + theme_bw() + labs(title = "Coronaviridae - Vert")
#scoreInPctidIntervals(HU2, "Coronaviridae", title="Coronaviridae in HU2") test <- family_df$pctid test2 <- sapply(test, function(x){ if ((x <= 100) & (x > 97)){ return("97-100") } else if ((x <= 97) & (x > 94)){ return("94-97") } else if ((x <= 94) & (x > 91)){ return("91-94") } else if ((x <= 91) & (x > 88)){ return("88-91") } else if ((x <= 88) & (x > 85)){ return("85-88") } else if ((x <= 85) & (x > 82)){ return("82-85") } else { return("<82") } }) test2 <- factor(test2, levels = c("<82", "82-85", "85-88", "88-91", "91-94", "94-97", "97-100")) family_df$pctid_intervals <- test2 p <- ggplot(family_df %>% group_by(score,pctid_intervals) %>% summarize(n=n(), log10n = log10(n() + 1)), aes(score, log10n,fill = pctid_intervals)) + geom_bar(color="black",stat = "identity") + scale_fill_viridis(discrete=TRUE) + theme_bw() + labs(title = "Coronaviridae - Vert")
#sra_names <- c("DRR000897", "DRR001151", "DRR001152") #test <- getLoadedSummaryFiles(sra_names, "../summary_files/") ####################################################################### viro_sra <- readSraRunInfo("../sra/viro_SraRunInfo.csv") viro_sra <- viro_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] mu_sra <- readSraRunInfo("../sra/mu_SraRunInfo.csv") mu_sra <- mu_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] vert_sra <- readSraRunInfo("../sra/vert_SraRunInfo.csv") vert_sra <- vert_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] mamm_sra <- readSraRunInfo("../sra/mamm_SraRunInfo.csv") mamm_sra <- mamm_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] hu_sra <- readSraRunInfo("../sra/hu_SraRunInfo.csv") hu_sra <- hu_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] hu_meta_sra <- readSraRunInfo("../sra/hu_meta_SraRunInfo.csv") hu_meta_sra <- hu_meta_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] all_sras <- rbind(viro_sra, vert_sra, mu_sra, mamm_sra, hu_sra, hu_meta_sra) all_sras <- all_sras[!duplicated(all_sras$Run), ] #filter files ################################################### #sumWheres_all <- paste("../summary_files/", system( paste0("ls ", "../summary_files/"), intern = T), sep="/") sumWheres_all <- system( paste0("ls ", "../summary_files/"), intern = T) sumWheres_all <- sapply(sumWheres_all, function(x) { unlist(strsplit(x, ".", fixed = TRUE))[1] }) #summary_files <- paste(all_sras$Run, "summary", sep=".") #sumWheres <- paste("../summary_files/", summary_files, sep="/") #remove files that are absent in the specified folder #files_to_analyze <- intersect(sumWheres_all, sumWheres) all_sras <- all_sras %>% filter(Run %in% sumWheres_all) ################################################ #number of unique sras: nsras <- nrow(all_sras) nsras <- as.character(nsras) #sums of mb used (and converted to gb) mbs <- sum(all_sras$size_MB) gbs <- mbs / 1000 gbs <- as.character(gbs) #how many nucleotides were analyzed spots_sum <- sum(all_sras$spots) spots_with_mates_sum <- sum(all_sras$spots_with_mates) nucl_sum <- spots_sum + spots_with_mates_sum #nucl_sum <- format(nucl_sum, scientific=FALSE) nucl_sum <- as.character(nucl_sum) res_lis <- list(nsras, gbs, nucl_sum) names(res_lis) <- c("Num_SRAs", "GBs", "Nucl_SUM") res_lis
#NEW DATA SET - 3.8 M ####################################################################### viro_sra <- readSraRunInfo("../new_sra/viro_SraRunInfo.csv") viro_sra <- viro_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] mu_sra <- readSraRunInfo("../new_sra/mu_SraRunInfo.csv") mu_sra <- mu_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] vert_sra <- readSraRunInfo("../new_sra/vert_SraRunInfo.csv") vert_sra <- vert_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] mamm_sra <- readSraRunInfo("../new_sra/mamm_SraRunInfo.csv") mamm_sra <- mamm_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] hu_sra <- readSraRunInfo("../new_sra/hu_SraRunInfo.csv") hu_sra <- hu_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] hu_meta_sra <- readSraRunInfo("../new_sra/hu_meta_SraRunInfo.csv") hu_meta_sra <- hu_meta_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] bat_sra <- readSraRunInfo("../new_sra/bat_SraRunInfo.csv") bat_sra <- bat_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] inv_sra <- readSraRunInfo("../new_sra/inv_SraRunInfo.csv") inv_sra <- inv_sra@runInfo[,c("Run","size_MB","spots", "spots_with_mates")] all_sras <- rbind(viro_sra, vert_sra, mu_sra, mamm_sra, hu_sra, hu_meta_sra, bat_sra, inv_sra) all_sras <- all_sras[!duplicated(all_sras$Run), ] #filter files ################################################### #sumWheres_all <- system( paste0("ls ", "../summary_files/"), intern = T) new_summary <- read.table("../new_summary_files.txt", stringsAsFactors = F) new_summary_files <- new_summary$V4 sumWheres_all <- sapply(new_summary_files, function(x) { unlist(strsplit(x, ".", fixed = TRUE))[1] }) all_sras <- all_sras %>% filter(Run %in% sumWheres_all) ################################################ hu_sra_present <- hu_sra %>% filter(Run %in% all_sras$Run) hu_sra_absent <- hu_sra %>% filter(!Run %in% all_sras$Run) #plotting the size distribution ################################################### library(plotly) fig <- plot_ly(alpha = 0.6) fig <- fig %>% add_histogram(x = hu_sra$size_MB, name = "humans") fig <- fig %>% add_histogram(x = hu_sra_present$size_MB, name = "available humans") fig <- fig %>% layout(xaxis = list(title="MBs"))#, range = c(0, 50000))) fig <- fig %>% layout(yaxis = list(type = "log")) fig <- fig %>% layout(xaxis = list(type = "log")) fig <- fig %>% layout(barmode = "overlay") fig <- plot_ly(y = log10(hu_sra$size_MB+1), type = "box", name="humans") fig <- fig %>% add_trace(y = log10(hu_sra_present$size_MB+1), type="box", name="available humans") fig <- fig %>% add_trace(y = log10(hu_sra_absent$size_MB+1), type="box", name="absent humans") density1 <- density(hu_sra$size_MB) density2 <- density(hu_sra_present$size_MB) density3 <- density(hu_sra_absent$size_MB) fig <- plot_ly(x = ~density1$x, y = ~density1$y, type = 'scatter', mode = 'lines', name = 'All humans', fill = 'tozeroy') fig <- fig %>% add_trace(x = ~density2$x, y = ~density2$y, name = 'Available humans', fill = 'tozeroy') fig <- fig %>% add_trace(x = ~density3$x, y = ~density3$y, name = 'Missing humans', fill = 'tozeroy') fig <- fig %>% layout(xaxis = list(title = 'MBs'), yaxis = list(title = 'Density')) fig <- fig %>% layout(xaxis = list(type = "log")) #the bump bump <- hu_sra_absent %>% filter(size_MB > 500 & size_MB < 1000) saveRDS(bump, file = "../human_absent_bump.rds") #################################################### all_hum_size <- sum(hu_sra$size_MB) gbs_all <- all_hum_size/1000 missing_hum_size <- sum(hu_sra_absent$size_MB) gbs_missing <- missing_hum_size/1000 ################################################### #number of unique sras: nsras <- nrow(all_sras) nsras <- as.character(nsras) #sums of mb used (and converted to gb) mbs <- sum(all_sras$size_MB) gbs <- mbs / 1000 gbs <- as.character(gbs) #how many nucleotides were analyzed spots_sum <- sum(all_sras$spots) spots_with_mates_sum <- sum(all_sras$spots_with_mates) nucl_sum <- spots_sum + spots_with_mates_sum #nucl_sum <- format(nucl_sum, scientific=FALSE) nucl_sum <- as.character(nucl_sum) res_lis <- list(nsras, gbs, nucl_sum) names(res_lis) <- c("Num_SRAs", "GBs", "Nucl_SUM") res_lis
Building an OTU-SRA correspondance file:
otus <- read.table("../data/otus92.tsv", sep="\t", stringsAsFactors = F) colnames(otus) <- c("OTU", "NAME") cov3ma <- read.table("../data/cov3ma.sumzer.tsv", sep="\t", stringsAsFactors = F, quote = "", comment.char = "$")
################################################################################################################ #VIRO ################################################################################################################ VIRO <- readRDS("../Atys/viro_atys.rds") VIRO <- VIRO@Serratus Coron_viro <- VIRO@family %>% filter(family %in% c("Coronaviridae")) %>% filter(score >= 80) %>% filter(pctid >= 95) accessions <- as.character(Coron_viro$top) accessions <- sapply(accessions, function(x) { unlist(strsplit(x, "[.]"))[1] }) Coron_viro$NAME <- accessions Coron_viro_short <- Coron_viro %>% dplyr::select("sra", "NAME") otus_viro <- merge(Coron_viro_short, otus, by="NAME") rm(VIRO) ################################################################################################################ #MAMM ################################################################################################################ MAMM <- readRDS("../Atys/mamm_atys.rds") MAMM <- MAMM@Serratus Coron_mamm <- MAMM@family %>% filter(family %in% c("Coronaviridae")) %>% filter(score >= 80) %>% filter(pctid >= 95) accessions <- as.character(Coron_mamm$top) accessions <- sapply(accessions, function(x) { unlist(strsplit(x, "[.]"))[1] }) Coron_mamm$NAME <- accessions Coron_mamm_short <- Coron_mamm %>% dplyr::select("sra", "NAME") otus_mamm <- merge(Coron_mamm_short, otus, by="NAME") rm(MAMM) ################################################################################################################ #VERT ################################################################################################################ DATA_PATH='../vert_dfs/temp_folder_vert_F/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) hits <- lapply(sumWheres, function(x){ df_subset <- read.fst(x, c("sra","score", "pctid", "family", "top")) return(df_subset) }) suppressWarnings(VERT <- bind_rows(hits)) Coron_vert <- VERT %>% filter(family %in% c("Coronaviridae")) %>% filter(score >= 80) %>% filter(pctid >= 95) rm(VERT) accessions <- as.character(Coron_vert$top) accessions <- sapply(accessions, function(x) { unlist(strsplit(x, "[.]"))[1] }) Coron_vert$NAME <- accessions Coron_vert_short <- Coron_vert %>% dplyr::select("sra", "NAME") otus_vert <- merge(Coron_vert_short, otus, by="NAME") ################################################################################################################ #MOUSE ################################################################################################################ DATA_PATH='../mu_dfs/temp_folder_mu_F/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) hits <- lapply(sumWheres, function(x){ df_subset <- read.fst(x, c("sra","score", "pctid", "family", "top")) return(df_subset) }) suppressWarnings(MU <- bind_rows(hits)) Coron_mu <- MU %>% filter(family %in% c("Coronaviridae")) %>% filter(score >= 80) %>% filter(pctid >= 95) rm(MU) accessions <- as.character(Coron_mu$top) accessions <- sapply(accessions, function(x) { unlist(strsplit(x, "[.]"))[1] }) Coron_mu$NAME <- accessions Coron_mu_short <- Coron_mu %>% dplyr::select("sra", "NAME") otus_mu <- merge(Coron_mu_short, otus, by="NAME") ################################################################################################################ #HUMAN ################################################################################################################ DATA_PATH='../hu_dfs/temp_folder_hu_F/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) hits <- lapply(sumWheres, function(x){ df_subset <- read.fst(x, c("sra","score", "pctid", "family", "top")) return(df_subset) }) suppressWarnings(HU <- bind_rows(hits)) Coron_hu <- HU %>% filter(family %in% c("Coronaviridae")) %>% filter(score >= 80) %>% filter(pctid >= 95) rm(HU) accessions <- as.character(Coron_hu$top) accessions <- sapply(accessions, function(x) { unlist(strsplit(x, "[.]"))[1] }) Coron_hu$NAME <- accessions Coron_hu_short <- Coron_hu %>% dplyr::select("sra", "NAME") otus_hu <- merge(Coron_hu_short, otus, by="NAME")
Count number of datasets per otu (species):
otus_datasets <- rbind(otus_viro, otus_vert, otus_mamm, otus_mu, otus_hu) otus_datasets <- unique(otus_datasets) otus_species <- unique(otus$OTU) test <- sapply(otus_species, function(x) { otus_datasets %>% filter(OTU == x) %>% nrow() }) res_abundance <- data.frame(OTU = as.character(), Number_of_datasets = as.numeric()) for (otu_num in otus_species){ name_of_otu <- paste("OTU", otu_num, sep="") num_of_datasets <- otus_datasets %>% filter(OTU == otu_num) %>% nrow() interim_res <- data.frame(OTU = name_of_otu, Number_of_datasets = num_of_datasets) res_abundance <- rbind(res_abundance, interim_res) } res_abundance <- res_abundance[order(-res_abundance$Number_of_datasets),] write.table(res_abundance, file="../data/otu_abundance_all.tsv", quote=F, sep="\t", row.names = F)
#viro2_dfs #NC_014468.1 - example accession <- "NC_014468.1" DATA_PATH='../viro2_dfs/viro2_Acc/' sumWheres <- paste0( DATA_PATH, system( paste0("ls ", DATA_PATH), intern = T )) fig <- getHeatmap(accession, sumWheres)
#install.packages("heatmaply") #library(heatmaply) #heatmaply(scale(mtcars), k_row = 3, k_col = 2)
library("dbplyr") library("RPostgreSQL") drv <- DBI::dbDriver("PostgreSQL") con <- DBI::dbConnect(drv, user="postgres", password="serratus", host="big-parse-db.ccz9y6yshbls.us-east-1.rds.amazonaws.com", port=5432, dbname="postgres") vtax <- read.table( file = "data/virus_taxid.tsv", header = TRUE) levels(vtax$family_name)[27] = "Unknown" RNAtaxa <- 1:26 # RNA viruses in vtax rna_families <- vtax$family_name[RNAtaxa]
#convert database output into dataframe CoV <- readDfSQL(con, "FamilySections", family = "Coronaviridae", columns = c("Sra","Score", "PctId", "Family"), score = 64, dataframe = T) OrV <- readDfSQL(con, "FamilySections", family = "Orthomyxoviridae", columns = c("Sra","Score", "PctId", "Family"), score = 24, dataframe = T) Para <- readDfSQL(con, "FamilySections", family = "Paramyxoviridae", columns = c("Sra","Score", "PctId", "Family"), score = 24, dataframe = T) Nairo <- readDfSQL(con, "FamilySections", family = "Nairoviridae", columns = c("Sra","Score", "PctId", "Family"), score = 24, dataframe = T) colnames(CoV) <- tolower(colnames(CoV)) colnames(OrV) <- tolower(colnames(OrV)) colnames(Para) <- tolower(colnames(Para)) colnames(Nairo) <- tolower(colnames(Nairo)) p <- pctidInScoreIntervals(CoV, "Coronaviridae", title="Coronaviridae", scale_log = F, bin_scores = F) d <- pctidInScoreIntervals(OrV, "Orthomyxoviridae", title="Orthomyxoviridae", scale_log = F, bin_scores = F) s <- pctidInScoreIntervals(Para, "Paramyxoviridae", title="Paramyxoviridae", scale_log = F, bin_scores = F) n <- pctidInScoreIntervals(Nairo, "Nairoviridae", title="Nairoviridae", scale_log = F, bin_scores = F)
#working with facet_wrap drv <- DBI::dbDriver("PostgreSQL") con <- DBI::dbConnect(drv, user="postgres", password="serratus", host="big-parse-db.ccz9y6yshbls.us-east-1.rds.amazonaws.com", port=5432, dbname="postgres") vtax <- read.table( file = "data/virus_taxid.tsv", header = TRUE) levels(vtax$family_name)[27] = "Unknown" RNAtaxa <- 1:26 # RNA viruses in vtax DNAtaxa <- 27:42 rna_families <- vtax$family_name[RNAtaxa] dna_families <- vtax$family_name[DNAtaxa] RNA_spec <- readDfSQL(con, "FamilySections", family = rna_families, columns = c("Sra","Score", "PctId", "Family"), score = 64, dataframe = T) DNA_spec <- readDfSQL(con, "FamilySections", family = dna_families, columns = c("Sra","Score", "PctId", "Family"), score = 64, dataframe = T) colnames(RNA_spec) <- tolower(colnames(RNA_spec)) colnames(DNA_spec) <- tolower(colnames(DNA_spec)) p <- pctidInScoreIntervals(RNA_spec, family_name = NULL, title="", scale_log = F, bin_scores = F) p + facet_wrap(~ family, ncol=5, scales="free_y")
d <- pctidInScoreIntervals(DNA_spec, family_name = NULL, title="", scale_log = F, bin_scores = F) d + facet_wrap(~ family, scales="free_y")
library("tantalus") library("dbplyr") library("RPostgreSQL") drv <- DBI::dbDriver("PostgreSQL") con <- DBI::dbConnect(drv, user="postgres", password="serratus", host="big-parse-db.ccz9y6yshbls.us-east-1.rds.amazonaws.com", port=5432, dbname="postgres") #with a database ##tbls: __EFMigrationsHistory, AccessionSections, FamilySections, FastaSections, Runs #get Family table, and Coronaviridae family (and get just specific columns) x <- readDfSQL(con, "FamilySections", family = "Coronaviridae", dataframe = T, columns = c("Sra","Score", "PctId", "Family")) #get specific SRAs sras <- c("SRR10144611", "SRR6906297", "SRR6906298", "SRR6906299", "SRR6906300", "SRR6906303", "SRR3229029", "SRR3229077", "SRR3229078", "SRR3229081") x_specific <- readDfSQL(con, "FamilySections", family = "Coronaviridae", sras = sras, dataframe = T, columns = c("Sra","Score", "PctId", "Family"))
colnames(x) <- tolower(colnames(x)) #plot pctid histogram with family score intervals p <- pctidInScoreIntervals(x, family_name = "Coronaviridae", title="Coronaviridae", scale_log = F, bin_scores = F) p
#plot pctid histogram with family score intervals p <- scoreInPctidIntervals(x, family_name = "Coronaviridae", title="Coronaviridae", scale_log = F, bin_percent = T) p
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.