knitr::opts_chunk$set( collapse = TRUE, comment = "#>", echo = TRUE, warning=FALSE, message = FALSE )
For conversion from .bam format to .rda we can use the bamToR()
function. This function uses a Rsamtools package from the Bioconductor, so we have to load this package before.
library(Rsamtools) library(directRNAExplorer)
bamData <- bamToR("P13.2016-11-17T16_40_36.1234062861011.fc1.ch1.sorted.bam")
We have to remember that we should set path to a directory containing this bam file.
Using frame created above we can find out, for each gene, how many nucleotides were recorded during the sequencing. For genes aggrergation we use the TAIR10_genes
data. In my examples I use data for first chromosome.
datawt <- dataChromosome1 databrm <- brmDataChromosome1
head(datawt)
genesSummary <- genesSummarize(datawt, chromosome = 1) genesSummary <- dplyr::arrange(genesSummary, desc(counts)) head(genesSummary)
We can compare the distribution of counts between two genes or part of the gene (for example three prime UTR, exons). We also can compare distributions between positive or negative strands. We can use for this comparision for example the two-sample Kolmogorov-Smirnov test. In next example I will use the data for gene "AT1G06680".
ksDistributionTest(datawt, databrm, "AT1G06760")
For selected parts of chromosomes in two groups we can compute the table with p-values for chosen test about their distributions.
table <- pvalTable(datawt, databrm, chromosome="1", type ="KS") head(dplyr::arrange(table, pval),6)
For chosen gene we can draw a coverage plot for datawt
and databrm
.
First we have plots for the three prime UTR.
plotGeneDistribution(gene="AT1G06760", bamDataFrame=datawt) plotGeneDistribution(gene="AT1G06760", bamDataFrame=databrm)
And plots for whole gene.
plotGeneDistribution(gene="AT1G06760", bamDataFrame=datawt) plotGeneDistribution(gene="AT1G06760", bamDataFrame=databrm)
We can also see the distribution of these two samples on one plot.
In this case red color lines correspond to data from first dataset and blue - to second.
We can also see the ecdf of these two samples on one plot.
plotGeneComparisonDistribution("AT1G06760", datawt, databrm,stat="ecdf")
For two types of individuals we can compute negative binomial test to find significantly changed genes between one and the other type.
Firstly, we have to compute, from sequencing data, the table of counts for each gene from Arabidopsis. We can create this table using countsForAllGenes
function.
counts_genes_wt1 <- countsForAllGenes(dataChromosome1, geneData = directRNAExplorer::Araport11, genePart="gene")
To test differences between types of individuals we must compute at least two counts tables. One for the first type, and then for the second type. I've create a table contains inforamation for two types - wild type and a mutant type (brm). We have seven samples in our test.
allCounts <- directRNAExplorer::countsAll
We want to test these genes which could be significant. Firstly, we should select genes which have "good number" of counts. We assumed that the counts are "good" if more than at least 2 samples in the wt case and at least 3 samples in brm case we have more than 9 counts for gene. In addition, we are looking for such genes that have many counts in one of the subtypes, and almost zero in the second one.
genesTest <-c() countsWithCondition <- allCounts for(i in 2:ncol(allCounts)){ countsWithCondition[,i] <- ifelse(allCounts[,i]>=9, 1, 0) } allCounts_2 <- countsWithCondition genes <- allCounts_2[,1] allCountsTransposed <- t(allCounts_2[,-1]) allCountsTransposed <- data.frame(allCountsTransposed) colnames(allCountsTransposed)<- genes allCountsTransposed$type <- c(rep("wt", 3), rep("c", 4)) allCountsTransposed <- allCountsTransposed[,c(ncol(allCountsTransposed), 1:(ncol(allCountsTransposed)-1))] sums <- aggregate(. ~ type, data=allCountsTransposed, FUN=sum) for(i in 2:ncol(allCountsTransposed)){ if(((sums[1,i]>=3) & (sums[2,i]>=2)) || ((sums[1,i]>=3) & (sums[2,i]==0))||((sums[1,i]==0) & (sums[2,i]>=2))){ genesTest[i] <- colnames(allCountsTransposed)[i] }else{ genesTest[i] <- NA } } genesTest2<- na.omit(genesTest) countsTest <- allCounts countsTest <- countsTest[which(countsTest$name %in% genesTest2),] countsTest2 <- data.frame(t(countsTest[,-1])) colnames(countsTest2) <- countsTest$name
For selected genes we perform a binomial negative test, in order to find genes for which the number of readings has changed significantly divided into two groups - wt and brm.
cond <- c(rep("wt", 3), rep("c", 4)) test<-nbinomTest(countsTest2, condition = cond)
To find the most changed genes, we will focus only on those that have a small pvalue and fold greater than 1.5.
library(dplyr) testFiltered <- filter(test, pvalue<0.05) testFiltered <- filter(testFiltered, abs(log2FoldChange)>0.6) testFiltered <- arrange(testFiltered, pvalue) testFiltered <- arrange(testFiltered, desc(log2FoldChange)) head(testFiltered)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.