# Copyright 2018 Kliebenstein Lab - UC Davis Plant Sciences. All rights reserved.
#
# You can redistribute it and/or modify it under the terms of the
# GNU General Public License Version 2. You should have received a copy of the
# GNU General Public License Version 2 along with Octopus project.
# If not, you can get one at https://github.com/WeiZhang317/octopus
#
#' Extract Short Reads
#'
#' Extract short reads from \code{seq_files} using \href{http://bowtie-bio.sourceforge.net/manual.shtml}{bowtie}.\cr
#' Output 2 files, \code{dist_dir}/orig.reads.csv and \code{dist_dir}/orig.reads.info.csv \cr
#' Remove \code{dist_dir}/reusing/ folder if don't want reuse data from that folder \cr
#' Setup ./tmp/ as an RAM disk folder will avoid lots of disk IOs, speed things up and protect you SSD.\cr
#' Example for linux :
#' \itemize{
#' \item cd to curent folder.
#' \item create ram disk in console : mount -t tmpfs -o size=4g tmpfs ./tmp/
#' }
#' Example for mac :
#' \itemize{
#' \item cd to curent folder.
#' \item mkdir -p tmp
#' \item sudo mount -t tmpfs -o size=4096M tmpfs ./tmp/
#' }
#' For windows, there are a number of RAM disk softerwares you can use.
#'
#' @param seq_files Sequencing files , accepts .fastq or .gz format for files.\cr
#' If \code{seq_files} is a \code{List} or \code{Vector}, index of \code{seq_files} is assumed to be sampe names of sequencing files.\cr
#' If \code{seq_files} is a \code{data.frame}, row.names is assumed to be sampe names of sequencing files, the first column is assumed to be sequencing files, when \code{type} is \code{paired} the second column is assumed to be the second mate pair sequences.
#' @param references A comma-separated list of FASTA files containing the reference sequences to be aligned to
#' @param type Could be one of c("single", "paired", "crossbow").\cr
#' If single, the input sequences are interpreted as single reads.\cr
#' If paired, they are supposed to be mate pair reads.\cr
#' If crossbow, they are considered to be Crossbow-style reads.
#' @param ... Additional arguments to be passed on to the binaries. See ... of \link[Rbowtie]{bowtie}
#' @param dist_dir folder for result file orig.reads.csv and orig.reads.info.csv
#' @export
#'
#' @examples --------------------
octopus.short_reads <- function(seq_files,references,...,type="single",dist_dir="results/"){
# check input data
if (is.data.frame(seq_files)){
sample_names <- row.names(seq_files)
if(type=="paired") {
sample_seq_files <- seq_files[,c(1,2)]
}else{
sample_seq_files <- seq_files[,1]
}
}else if(is.vector(seq_files) || is.list(seq_files) ){
sample_seq_files <- seq_files
sample_names <- 1:length(seq_files)
if(type=="paired") {
stop("")
}
}else{
print("seq_files must be one of vector/list/dta.frame type.")
return(NULL)
}
# create directory for results
short_reads_path <- paste0(dist_dir,"reusing/short_reads/")
dir.create(short_reads_path, showWarnings = FALSE, recursive = TRUE )
dir.create("tmp", showWarnings = FALSE, recursive = TRUE )
refs_sign_str <- NULL
for (str in references) {
if(is.null(refs_sign_str)){
refs_sign_str <- readChar(str,10000000)
}else{
refs_sign_str <- c(refs_sign_str,readChar(str,10000000))
}
}
# detect changing of input data
signs <- list(refs=sha1(refs_sign_str),seq_files=sha1(seq_files),ext_args=sha1(list(...)),type=type) # TODO list(2) -> list(...)
signs_file <- paste0(short_reads_path,"sha1_signatures")
if(!file.exists(signs_file)){ write.csv(signs,signs_file,row.names = FALSE) }
signs_old <- read.csv(signs_file)
is_mismatch = signs$refs != signs_old$refs || signs$seq_files != signs_old$seq_files || signs$ext_args != signs_old$ext_args || signs$type != signs_old$type
if(is_mismatch){
warning("parameter has been changed...
remove folder ",short_reads_path," and restart if changes can affect results.
remove ",signs_file," and restart to stop this message if changes won't affect results.")
}
mismatch <- function(){
if(!is_mismatch){ return() }
print("parameter has been changed...")
print(paste("remove folder ",short_reads_path," and restart if changes can affect results."))
print(paste("remove file ",signs_file," and restart to stop this message if changes won't affect results."))
}
mismatch()
# build index
refs <- octopus.bowtie_build(references,dist_dir)
# alignment
for(i in 1:length(sample_names)){
sample_name <- sample_names[i]
if(type=="paired") {
sample_seq_file <- sample_seq_files[i,]
} else {
sample_seq_file <- sample_seq_files[i]
}
print(paste("process sample",sample_name,", seq_file",sample_seq_file))
reads_file <- paste0(short_reads_path,sample_name,".reads.csv")
if(file.exists(reads_file)) { # try reuse
print(paste("reusing sample",sample_name,", file ",reads_file))
mismatch()
}else{ #align
ret <- octopus.bowtie_align(sample_seq_file,refs,type=type,...)
colnames(ret) <- c("gene",sample_name) #set column name
print(paste("saving into file ",reads_file))
write.csv(ret,file=reads_file,row.names=FALSE)
}
}
# merge
octopus.short_reads.merge(short_reads_path,dist_dir)
}
octopus.short_reads.merge <- function(src_dir,dist_dir="results/"){
print(paste("start merging reads files in folder ",src_dir))
orig.reads = NULL
files <- list.files(src_dir, pattern="\\.reads\\.csv$", recursive = TRUE , all.files = FALSE)
for(f in files) {
f <- paste0(src_dir,"/",f)
ret <- read.csv(f,check.names = FALSE)
colnames(ret)
if (is.null(orig.reads)) {
orig.reads <- ret
}else{
orig.reads <- merge(orig.reads,ret,all=T,by="gene")
}
names <- names(orig.reads)
print(paste("merge" , f , " into orig.reads. names [",names[1],"...",names[length(names)],"]"))
}
if(is.null(orig.reads )){ octopus.info("orig.reads is empty ...") ; return(FALSE) }
ret_map <- orig.reads[orig.reads$gene!="*",]
print(paste0("saving results to ",dist_dir,"orig.reads.csv"))
dir.create(dist_dir, showWarnings = FALSE, recursive = TRUE )
write.csv(ret_map,file=paste0(dist_dir,"orig.reads.csv"),row.names=FALSE)
print(paste("summarize mapped and unmapped reads ..."))
unmapped <- orig.reads[orig.reads$gene=="*",-1]
unmapped <- t(unmapped)
colnames(unmapped) <- "unmapped"
mapped <- apply(ret_map[-1],2,sum,na.rm=T)
mapped <- data.frame(mapped)
mapped_percent <- round(mapped/(mapped+unmapped)*100,1)
mapped_percent <- cbind(rownames(mapped_percent),mapped_percent[,1])
colnames(mapped_percent) <- c("SampleName","Qulity")
orig.reads.info <- data.frame(mapped_percent,mapped,unmapped)
print(paste0("saving qulity info to ",dist_dir,"orig.reads.info.csv"))
write.csv(orig.reads.info,file=paste0(dist_dir,"orig.reads.info.csv"),row.names=FALSE)
print(paste("merging done!!!"))
}
octopus.bowtie_build <- function(references,dist_dir="results/"){
# sha1 signature
# build index
refs_sign_str <- NULL
for (str in references) {
if(is.null(refs_sign_str)){
refs_sign_str <- readChar(str,10000000)
}else{
refs_sign_str <- c(refs_sign_str,readChar(str,10000000))
}
}
refs_sign <- sha1(refs_sign_str)
refs <- paste0(dist_dir,"/reusing/refs/",refs_sign)
if(dir.exists(refs)){
print(paste("reusing bowtie index ",refs," for references ",references))
}else{
print(paste("build bowtie index for references ",references))
refs_tmp <- paste0(refs,"_tmp")
bowtie_build(references=references, outdir=refs_tmp, force=TRUE)
file.rename(refs_tmp,refs)
}
# use ram disk for bowtie index
refs_ram <- "tmp/refs"
dir.create(refs_ram, showWarnings = FALSE, recursive = TRUE )
file.copy(paste0(refs,"/",list.files(refs)),paste0(refs_ram,"/",list.files(refs)),overwrite = TRUE)
paste0(refs_ram,"/index")
}
octopus.bowtie_align <- function(sample_seq_file,refs,...,type="single"){
# unzip
tmp_files <- NULL
if(type=="paired") {
# sample_seq_file <- sample_seq_files[1,]
sample_seq_file_pair <- as.character(sample_seq_file[2])
if(file_ext(sample_seq_file_pair) == "gz"){# unzip
fastq_file_pair <- "tmp/orig_pair.fastq"
gunzip(sample_seq_file_pair,fastq_file_pair,overwrite=TRUE,remove=FALSE)
tmp_files <- c(tmp_files,fastq_file_pair)
}else{
if(file_ext(sample_seq_file_pair) != "fastq"){ octopus.info("expecting .gz/.fastq file.") ; return(FALSE) }
fastq_file_pair <- sample_seq_file_pair
}
sample_seq_file <- as.character(sample_seq_file[1])
}
if(file_ext(sample_seq_file) == "gz"){# unzip
fastq_file <- "tmp/orig.fastq"
gunzip(sample_seq_file,fastq_file,overwrite=TRUE,remove=FALSE)
tmp_files <- c(tmp_files,fastq_file)
}else{
if(file_ext(sample_seq_file) != "fastq"){ octopus.info("expecting .gz/.fastq file.") ; return(FALSE) }
fastq_file <- sample_seq_file
}
#do alignment
sam_file <- "tmp/orig.sam"
tmp_files <- c(tmp_files,sam_file)
# S=TRUE # SAM format ,
# q=TRUE # query input files are FASTQ .fq/.fastq
ext_args <- list(...)
if(type=="paired") {
print(paste("fist mate pair sequences:",sample_seq_file))
print(paste("second mate pair sequences:",sample_seq_file_pair))
ext_args <- append(list(sequences=list(fastq_file,fastq_file_pair),index=refs,outfile=sam_file, type=type,force=TRUE,S=TRUE,q=TRUE),ext_args) # ,y=TRUE
}else{
ext_args <- append(list(sequences=fastq_file,index=refs,outfile=sam_file, type=type,force=TRUE,S=TRUE,q=TRUE),ext_args)
}
do.call(bowtie,ext_args)
#read the sam file.
ret <- scan(sam_file,what=list(NULL,NULL,""),comment.char="@",sep="\t",flush=T)[[3]] # We only care about the third column.also discard the header info (rows starting with "@") -- Mike Covington - Julin Maloofs lab
file.remove(tmp_files) # clean up
if(length(ret)==0){ print(paste("empty results ...")) ; return(ret) }
ret <- as.data.frame(table(ret))
colnames(ret) <- c("gene","reads") #set column name
ret
}
tutorial.octopus.short_reads <- function(topic=NULL){
tutorial.octopus.short_reads.do <- function(){
# single
references <- "seq_data/cdna/Arabidopsis_thaliana.TAIR10.25.cdna.all.fa"
seq_files <- data.frame(seq_file=c("seq_data/1_AACGTGAT_L003_R1_001.fastq.gz","seq_data/1_AACGTGAT_L007_R1_001.fastq","seq_data/3_AACGTGAT_L003_R1_001.fastq.gz")
,sample_name=c("sample1","sample2","sample3")
,stringsAsFactors = FALSE)
row.names(seq_files) <- seq_files$sample_name
octopus.short_reads(seq_files,references
,p=3 # number of alignment threads to launch
,`phred33-quals`=TRUE # input quals are Phred+33
,t=TRUE # print wall-clock time taken by search phases
,quiet=TRUE # print nothing but the alignments
,trim5=10 # trim <int> bases from 5' (left) end of reads
)
# paired
references <- "seq_data/cdna/Arabidopsis_thaliana.TAIR10.25.cdna.all.fa"
seq_files <- data.frame(seq_file=c("seq_data/A9_S1_L001_R1_001.fastq.gz","seq_data/xxx_R1_001.fastq.gz","seq_data/xxxx_R1_001.fastq.gz")
,seq_file_pair=c("seq_data/A9_S1_L001_R2_001.fastq.gz","seq_data/xxx_R2_001.fastq.gz","seq_data/xxxx_R2_001.fastq.gz")
,sample_name=c("sample1","sample2","sample3")
,stringsAsFactors = FALSE
)
row.names(seq_files) <- seq_files$sample_name
octopus.short_reads(seq_files,references
,p=3 # number of alignment threads to launch
,`phred33-quals`=TRUE # input quals are Phred+33
,t=TRUE # print wall-clock time taken by search phases
,quiet=TRUE # print nothing but the alignments
,trim5=10 # trim <int> bases from 5' (left) end of reads
# ,y=TRUE # more sensitive but much slower, see http://bowtie-bio.sourceforge.net/manual.shtml#bowtie-options-y
,type="paired"
,dist_dir="results_paired/"
)
# multiple referencing files
references <- c("seq_data/ref_sequences/Botrytisfusarivirus1.txt","seq_data/ref_sequences/BotrytisHypovirus1.txt")
seq_files <- data.frame(seq_file=c("seq_data/1_AACGTGAT_L003_R1_001.fastq.gz","seq_data/1_AACGTGAT_L007_R1_001.fastq","seq_data/3_AACGTGAT_L003_R1_001.fastq.gz")
,sample_name=c("sample1","sample2","sample3")
,stringsAsFactors = FALSE)
row.names(seq_files) <- seq_files$sample_name
octopus.short_reads(seq_files,references
,p=3 # number of alignment threads to launch
,`phred33-quals`=TRUE # input quals are Phred+33
,t=TRUE # print wall-clock time taken by search phases
,quiet=TRUE # print nothing but the alignments
,trim5=10 # trim <int> bases from 5' (left) end of reads
)
}
print(tutorial.octopus.short_reads.do)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.