library(gridExtra) library(knitr) library(ggplot2) #install.packages("fastqcr") library(fastqcr) #must run this if fastqc is not already installed locally #fastqc_install()
###ONLY THIS CHUNK REQUIRES MODIFICATION### ###assign your directory locations here: #specify full path to directory containing a .fastq.gz file for each sample fq.dir<-"/home/d669d153/work/dia.din/fq" #specify full path to the output directory where you want qc.dir<-"~/Downloads/qc" #run fastqc on all .fastq.gz files, through r #only needs to be run once, if tweaking downstream visualizations, comment out this step #fastqc(fq.dir = fq.dir, # FASTQ files directory # qc.dir = qc.dir, # Results directory # threads = 4 # Number of threads # )
# List of files in the output directory to ensure fastqc worked list.files(qc.dir) #create a character vector where each value is the full path to the .zip created by fastqc() for a given sample #samps<-list.files("/home/d669d153/work/dia.din/qc", full.names = T, pattern = "*.zip") samps<-list.files(qc.dir, full.names = T, pattern = "*.zip") #plot qc test results for each sample for (i in samps){ #read info for given sample from the .zip file generated in the previous step samp.info <- qc_read(i) #open blank list to hold qc visualizations for the given sample plot<-list() #do qc for the given sample plot[[1]]<-qc_plot(samp.info, "Basic statistics") plot[[2]]<-qc_plot(samp.info, "Per sequence quality scores") plot[[3]]<-qc_plot(samp.info, "Sequence duplication levels") #visualize tables print(paste0("QC results for sample ", gsub(".*/", "", i))) cat('\n') print(kable(plot[[1]])) cat('\n') #visualize plots grid.arrange(plot[[2]],plot[[3]], ncol=2) #clear plot to hold info for next sample rm(plot) }
#aggregate the reports by pointing this function to the folder holding output of fastqc() #qc <- qc_aggregate("/home/d669d153/work/dia.din/qc", progressbar=F) qc <- qc_aggregate(qc.dir, progressbar = F) #stats per sample knitr::kable(qc_stats(qc))
#save stats info as an object stats.info<-qc_stats(qc) #make tot.seq numeric stats.info$tot.seq<-as.numeric(stats.info$tot.seq) #make histogram of number of sequence reads for each sample ggplot(stats.info, aes(x=tot.seq))+ geom_histogram(color="black", fill="white", bins=20)+ geom_vline(aes(xintercept=median(tot.seq)), color = "red")+ geom_vline(aes(xintercept=median(tot.seq)*.1), color = "red", lty=14)+ theme_classic()+ xlab("Number of sequencing reads") #solid red line = median sample value #dashed red line = 10% of median sample value ggplot(stats.info, aes(x=tot.seq))+ geom_histogram(color="black", fill="white", bins=200)+ geom_vline(aes(xintercept=median(tot.seq)), color = "red")+ geom_vline(aes(xintercept=median(tot.seq)*.1), color = "red", lty=14)+ theme_classic()+ xlab("Number of sequencing reads") #show me the samples that have less than 10% of the number of reads as the median sample from this experiment (these should be dropped immediately) print(paste("Median sample contains", median(stats.info$tot.seq), "reads. The following samples contain less than", median(stats.info$tot.seq)*.1, "reads (10% of the median), and should likely be dropped")) knitr::kable(stats.info[stats.info$tot.seq < median(stats.info$tot.seq)*.1,])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.