octopus.short_reads: Extract Short Reads
In WeiZhang317/octopus: Tools for GLM NB analysis

Description Usage Arguments Examples

Extract short reads from seq_files using bowtie.

Output 2 files, dist_dir/orig.reads.csv and dist_dir/orig.reads.info.csv

Remove the dist_dir/reusing/ folder if you do not want to reuse data from that folder.

Setting up ./tmp/ as a RAM disk folder avoids a lot of disk I/O, speeds things up, and protects your SSD.

Example for linux :

cd to the current folder.
Create a RAM disk from the console: mount -t tmpfs -o size=4g tmpfs ./tmp/

Example for mac :

cd to the current folder.
mkdir -p tmp
sudo mount -t tmpfs -o size=4096M tmpfs ./tmp/

For Windows, there are a number of RAM disk software packages you can use.

octopus.short_reads(seq_files, references, ..., type = "single",
  dist_dir = "results/")

`seq_files`	Sequencing files; accepts .fastq or .gz format for files. If `seq_files` is a `List` or `Vector`, the index of `seq_files` is assumed to be the sample names of the sequencing files. If `seq_files` is a `data.frame`, row.names is assumed to be the sample names of the sequencing files and the first column is assumed to be the sequencing files; when `type` is `paired`, the second column is assumed to be the second mate pair sequences.
`references`	A comma-separated list of FASTA files containing the reference sequences to be aligned to
`...`	Additional arguments to be passed on to the binaries. See the `...` argument of bowtie.
`type`	Could be one of c("single", "paired", "crossbow"). If single, the input sequences are interpreted as single reads. If paired, they are supposed to be mate pair reads. If crossbow, they are considered to be Crossbow-style reads.
`dist_dir`	Folder for the result files orig.reads.csv and orig.reads.info.csv

{
  # Example 1: single-end reads ----
  # One sequencing file per sample; row names carry the sample names.
  references <- "seq_data/cdna/Arabidopsis_thaliana.TAIR10.25.cdna.all.fa"
  seq_files <- data.frame(
    seq_file = c(
      "seq_data/1_AACGTGAT_L003_R1_001.fastq.gz",
      "seq_data/1_AACGTGAT_L007_R1_001.fastq",
      "seq_data/3_AACGTGAT_L003_R1_001.fastq.gz"
    ),
    sample_name = c("sample1", "sample2", "sample3"),
    stringsAsFactors = FALSE
  )
  rownames(seq_files) <- seq_files[["sample_name"]]

  octopus.short_reads(
    seq_files,
    references,
    p = 3, # number of alignment threads to launch
    `phred33-quals` = TRUE, # input quals are Phred+33
    t = TRUE, # print wall-clock time taken by search phases
    quiet = TRUE, # print nothing but the alignments
    trim5 = 10 # trim <int> bases from 5' (left) end of reads
  )

  # Example 2: paired-end reads ----
  # First column holds mate 1, second column holds mate 2 of each sample.
  references <- "seq_data/cdna/Arabidopsis_thaliana.TAIR10.25.cdna.all.fa"
  seq_files <- data.frame(
    seq_file = c(
      "seq_data/A9_S1_L001_R1_001.fastq.gz",
      "seq_data/xxx_R1_001.fastq.gz",
      "seq_data/xxxx_R1_001.fastq.gz"
    ),
    seq_file_pair = c(
      "seq_data/A9_S1_L001_R2_001.fastq.gz",
      "seq_data/xxx_R2_001.fastq.gz",
      "seq_data/xxxx_R2_001.fastq.gz"
    ),
    sample_name = c("sample1", "sample2", "sample3"),
    stringsAsFactors = FALSE
  )
  rownames(seq_files) <- seq_files[["sample_name"]]

  octopus.short_reads(
    seq_files,
    references,
    p = 3, # number of alignment threads to launch
    `phred33-quals` = TRUE, # input quals are Phred+33
    t = TRUE, # print wall-clock time taken by search phases
    quiet = TRUE, # print nothing but the alignments
    trim5 = 10, # trim <int> bases from 5' (left) end of reads
    # y = TRUE, # more sensitive but much slower, see http://bowtie-bio.sourceforge.net/manual.shtml#bowtie-options-y
    type = "paired",
    dist_dir = "results_paired/"
  )

  # Example 3: several reference files ----
  # NOTE(review): no dist_dir is given here, so this run uses the default
  # "results/" just like Example 1 -- confirm that sharing the output folder
  # (and its reusing/ sub-folder) is intended.
  references <- c(
    "seq_data/ref_sequences/Botrytisfusarivirus1.txt",
    "seq_data/ref_sequences/BotrytisHypovirus1.txt"
  )
  seq_files <- data.frame(
    seq_file = c(
      "seq_data/1_AACGTGAT_L003_R1_001.fastq.gz",
      "seq_data/1_AACGTGAT_L007_R1_001.fastq",
      "seq_data/3_AACGTGAT_L003_R1_001.fastq.gz"
    ),
    sample_name = c("sample1", "sample2", "sample3"),
    stringsAsFactors = FALSE
  )
  rownames(seq_files) <- seq_files[["sample_name"]]

  octopus.short_reads(
    seq_files,
    references,
    p = 3, # number of alignment threads to launch
    `phred33-quals` = TRUE, # input quals are Phred+33
    t = TRUE, # print wall-clock time taken by search phases
    quiet = TRUE, # print nothing but the alignments
    trim5 = 10 # trim <int> bases from 5' (left) end of reads
  )
}