rnaseqProcess: Constructor for class PreProcessData

Description Usage Arguments Details Value Examples

View source: R/RNAseq.R

Description

Preprocess raw RNASeq data into quantified data.

Usage

1
rnaseqProcess(Gfasta, targets, reference, gff1, type, plot = TRUE)

Arguments

Gfasta

bowtie-build command for building index

targets

txt file containg name of all sample

reference

refrence genome index file name

gff1

Annotation data

type

Single or Paired end data

plot

Details

rnaseqProcess function use command line tool bowtie. It takes .fasta and .fastq file align the reads and generate output bam files. If user already has aligned reads in .bam format that can also use readAlign function. It annotates reads by importing data from gff1. After this it reads counts per annotation and normalize them.rnaseqProcess function use command line tool bowtie. It takes .fasta and .fastq file align the reads and generate output .bam files. If user already has aligned reads in .bam format that can also use readAlign function. It annotates reads by importing data from gff1. After this it reads counts per annotation and normalize them.

Value

Return an object of S4 class PreProcessData.

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
##---- Should be DIRECTLY executable !! ----
##-- ==>  Define data, use random,
##--	or do  help(data=index)  for the standard data sets.

## The function is currently defined as
function (Gfasta, targets, reference, gff1, type, plot = TRUE) 
{
    command <- paste("bowtie2-build", Gfasta, Gfasta)
    command1 <- paste("bowtie2 -x", Gfasta)
    system(command)
    if (type == "single") {
        for (i in seq(along = targets$Fastq)) {
            input <- paste("./", targets$Fastq, sep = "")
            output <- paste("./", targets$Fastq, ".sam", sep = "")
            command2 <- paste(command1, input[i], "-S", output[i])
            system(command2)
            Rsamtools::asBam(file = output[i], destination = gsub(".sam", 
                "", output[i]), overwrite = TRUE, indexDestination = TRUE)
            unlink(output[i])
        }
    }
    else {
        for (i in seq(along = targets$Fastq)) {
            input <- paste("./", targets$file1, sep = "")
            input1 <- paste("./", targets$file2, sep = "")
            output <- paste("./", targets$Fastq, ".sam", sep = "")
            command2 <- paste(command1, "-1", input[i], "-2", 
                input1[i], "-S", output[i])
            system(command2)
            Rsamtools::asBam(file = output[i], destination = gsub(".sam", 
                "", output[i]), overwrite = TRUE, indexDestination = TRUE)
            unlink(output[i])
        }
    }
    library(Biostrings)
    library(IRanges)
    library(GenomicRanges)
    gtf2GRangesList <- function(myfile = "my.gff") {
        gtf <- read.delim(myfile, header = FALSE)
        colnames(gtf) <- c("seqname", "source", "feature", "start", 
            "end", "score", "strand", "frame", "attributes")
        chronly <- c(1:22, "X", "Y", "MT")
        gtf <- gtf[as.character(gtf$seqname) %in% chronly, ]
        gene_ids <- gsub(".*gene_id (.*?);.*", "\1", gtf$attributes)
        transcript_ids <- gsub(".*transcript_id (.*?);.*", "\1", 
            gtf$attributes)
        index <- gene_ids != ""
        index2 <- transcript_ids != ""
        gene.gr <- GRanges(seqnames = gtf$seqname[index], ranges = IRanges(gtf$start[index], 
            gtf$end[index]), strand = gtf$strand[index], tx_id = transcript_ids[index], 
            gene_id = gene_ids[index])
        gene.gr.list <- split(gene.gr, gene_ids[index])
        transcript.gr <- GRanges(seqnames = gtf$seqname[index2], 
            ranges = IRanges(gtf$start[index2], gtf$end[index2]), 
            strand = gtf$strand[index2], tx_id = transcript_ids[index2], 
            gene_id = gene_ids[index2])
        transcript.gr.list <- split(transcript.gr, transcript_ids[index2])
        r <- list()
        r$gene <- gene.gr.list
        r$transcript <- transcript.gr.list
        return(r)
    }
    gffsub <- gtf2GRangesList(gff1)
    samples <- as.character(targets$Fastq)
    samplespath <- paste("./", samples, ".bam", sep = "")
    names(samplespath) <- samples
    bfl <- Rsamtools::BamFileList(samplespath, index = character())
    countDF2 <- GenomicRanges::summarizeOverlaps(gffsub$gene, 
        bfl, mode = "Union", ignore.strand = TRUE)
    countDF2 <- as.matrix(assays(countDF2)$counts)
    if (plot == TRUE) {
        pdf("BarPlot.pdf")
        barplot(colSums(countDF2) * 1e-06, ylab = "Library size (millions)")
        dev.off()
    }
    g <- gffsub$gene
    returnRPKM <- function(counts, g) {
        geneLengthsInKB <- sum(width(g))/1000
        millionsMapped <- sum(counts)/1e+06
        rpm <- counts/millionsMapped
        rpkm <- rpm/geneLengthsInKB
        return(rpkm)
    }
    countDFrpkm <- apply(countDF2, 2, function(x) returnRPKM(counts = x, 
        g = g))
    new("PreProcessData", qData = as.matrix(countDFrpkm), phenoData = phenodata)
  }

HTDA documentation built on May 2, 2019, 4:53 p.m.