inst/doc/dupRadar.R

## ----pre,echo=FALSE,results='hide'--------------------------------------------
library(knitr)
opts_chunk$set(warning=FALSE,message=FALSE,cache=TRUE)

## ----style,echo=FALSE,results='asis'------------------------------------------
BiocStyle::markdown()

## ----loadLibrary--------------------------------------------------------------
library(dupRadar)

## ----eval=FALSE---------------------------------------------------------------
#  # call the duplicate marker and analyze the reads
#  bamDuprm <- markDuplicates(dupremover="bamutil",
#  			   bam="test.bam",
#  	    	   path="/opt/bamUtil-master/bin",
#  			   rminput=FALSE)

## -----------------------------------------------------------------------------
if(suppressWarnings(require(AnnotationHub))) {
    ah = AnnotationHub()
    query(ah, c("ensembl", "80", "Takifugu", "gtf"))    # discovery
    cache(ah["AH47101"])                                # retrieve / file path
}

## -----------------------------------------------------------------------------
attach(dupRadar_examples)

## ----eval=FALSE---------------------------------------------------------------
#  # The call parameters:
#  bamDuprm <- "test_duprm.bam"	# the duplicate marked bam file
#  gtf <- "genes.gtf"	# the gene model
#  stranded <- 2		# '0' (unstranded), '1' (stranded) and '2' (reversely stranded)
#  paired   <- FALSE	# is the library paired end?
#  threads  <- 4		# number of threads to be used
#  
#  # Duplication rate analysis
#  dm <- analyzeDuprates(bamDuprm,gtf,stranded,paired,threads)

## ----fig.width=14,fig.height=7------------------------------------------------
## make a duprate plot (blue cloud)
par(mfrow=c(1,2))
duprateExpDensPlot(DupMat=dm)		# a good looking plot
title("good experiment")
duprateExpDensPlot(DupMat=dm.bad)	# a dataset with duplication problems
title("duplication problems")

## duprate boxplot
duprateExpBoxplot(DupMat=dm)		# a good looking plot
duprateExpBoxplot(DupMat=dm.bad)	# a dataset with duplication problems

## ----fig.width=7,fig.height=7-------------------------------------------------
duprateExpDensPlot(DupMat=dm)

# or, just to get the fitted model without plot
fit <- duprateExpFit(DupMat=dm)
cat("duprate at low read counts: ",fit$intercept,"\n",
	"progression of the duplication rate: ",fit$slope,fill=TRUE)

## ----eval=FALSE---------------------------------------------------------------
#  ## INTERACTIVE: identify single points on screen (name="ID" column of dm)
#  duprateExpPlot(DupMat=dm)		# a good looking plot
#  duprateExpIdentify(DupMat=dm)

## ----fig.width=14,fig.height=7------------------------------------------------
par(mfrow=c(1,2))
cols <- colorRampPalette(c("black","blue","green","yellow","red"))
duprateExpPlot(DupMat=dm,addLegend=FALSE)
duprateExpPlot(DupMat=dm.bad,addLegend=FALSE,nrpoints=10,nbin=500,colramp=cols)

## ----fig.width=7,fig.height=7-------------------------------------------------
readcountExpBoxplot(DupMat=dm)

## ----fig.width=7,fig.height=7-------------------------------------------------
expressionHist(DupMat=dm)

## -----------------------------------------------------------------------------
# calculate the fraction of multimappers per gene
dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti
# how many genes are exclusively covered by multimappers
sum(dm$mhRate == 1, na.rm=TRUE)

# and how many have an RPKM (including multimappers) > 5
sum(dm$mhRate==1 & dm$RPKMMulti > 5, na.rm=TRUE)

# and which are they?
dm[dm$mhRate==1 & dm$RPKMMulti > 5, "ID"]

## ----fig.width=7,fig.height=7-------------------------------------------------
hist(dm$mhRate, 
     breaks=50, 
     col="red", 
     main="Frequency of multimapping rates", 
     xlab="Multimapping rate per gene", 
     ylab="Frequency")


## ----fig.width=7,fig.height=7-------------------------------------------------
# comparison of multi-mapping RPK and uniquely-mapping RPK
plot(log2(dm$RPK), 
     log2(dm$RPKMulti), 
     xlab="Reads per kb (uniquely mapping reads only)",
     ylab="Reads per kb (all including multimappers, non-weighted)"
)

## ---- eval=F------------------------------------------------------------------
#  identify(log2(dm$RPK),
#           log2(dm$RPKMulti),
#  	 labels=dm$ID)

## ----eval=F-------------------------------------------------------------------
#  library(dupRadar)
#  library(biomaRt)
#  
#  ## for detailed explanations on biomaRt, please see the respective
#  ## vignette
#  
#  ## set up biomart connection for mouse (needs internet connection)
#  ensm <- useMart("ensembl")
#  ensm <- useDataset("mmusculus_gene_ensembl", mart=ensm)
#  
#  ## get a table which has the gene GC content for the IDs that have been
#  ## used to generate the table (depends on the GTF annotation that you
#  ## use)
#  tr <- getBM(attributes=c("mgi_symbol", "percentage_gc_content"),
#              values=TRUE, mart=ensm)
#  
#  ## create a GC vector with IDs as element names
#  mgi.gc <- tr$percentage_gc_content
#  names(mgi.gc) <- tr$mgi_symbol
#  
#  ## using dm demo duplication matrix that comes with the package
#  ## add GC content to our demo data and keep only subset for which we can
#  ## retrieve data
#  keep <- dm$ID %in% tr$mgi_symbol
#  dm.gc <- dm[keep,]
#  dm.gc$gc <- mgi.gc[dm.gc$ID]
#  
#  ## check distribution of annotated gene GC content (in %)
#  boxplot(dm.gc$gc, main="Gene GC content", ylab="% GC")

## ----eval=F-------------------------------------------------------------------
#  par(mfrow=c(1,2))
#  
#  ## below median GC genes
#  duprateExpDensPlot(dm.gc[dm.gc$gc<=45,], main="below median GC genes")
#  
#  ## above median GC genes
#  duprateExpDensPlot(dm.gc[dm.gc$gc>=45,], main="above median GC genes")

## ----eval=F-------------------------------------------------------------------
#  #!/usr/bin/env Rscript
#  
#  ########################################
#  ##
#  ## dupRadar shell script
#  ## call dupRadar R package from the shell for
#  ## easy integration into pipelines
#  ##
#  ## Holger Klein & Sergi Sayols
#  ##
#  ## https://github.com/ssayols/dupRadar
#  ##
#  ## input:
#  ## - _duplicate marked_ bam file
#  ## - gtf file
#  ## - parameters for duplication counting routine:
#  ##   stranded, paired, outdir, threads.
#  ##
#  ########################################
#  
#  library(dupRadar)
#  
#  ####################
#  ##
#  ## get name patterns from command line
#  ##
#  args   <- commandArgs(TRUE)
#  
#  ## the bam file to analyse
#  bam <- args[1]
#  ## usually, same GTF file as used in htseq-count
#  gtf <- gsub("gtf=","",args[2])
#  ## no|yes|reverse
#  stranded <- gsub("stranded=","",args[3])
#  ## is a paired end experiment
#  paired   <- gsub("paired=","",args[4])
#  ## output directory
#  outdir   <- gsub("outdir=","",args[5])
#  ## number of threads to be used
#  threads  <- as.integer(gsub("threads=","",args[6]))
#  
#  if(length(args) != 6) {
#    stop (paste0("Usage: ./dupRadar.sh <file.bam> <genes.gtf> ",
#  			   "<stranded=[no|yes|reverse]> paired=[yes|no] ",
#  			   "outdir=./ threads=1"))
#  }
#  
#  if(!file.exists(bam)) {
#    stop(paste("File",bam,"does NOT exist"))
#  }
#  
#  if(!file.exists(gtf)) {
#    stop(paste("File",gtf,"does NOT exist"))
#  }
#  
#  if(!file.exists(outdir)) {
#    stop(paste("Dir",outdir,"does NOT exist"))
#  }
#  
#  if(is.na(stranded) | !(grepl("no|yes|reverse",stranded))) {
#    stop("Stranded has to be no|yes|reverse")
#  }
#  
#  if(is.na(paired) | !(grepl("no|yes",paired))) {
#    stop("Paired has to be no|yes")
#  }
#  
#  if(is.na(threads)) {
#    stop("Threads has to be an integer number")
#  }
#  
#  stranded <- if(stranded == "no") 0 else if(stranded == "yes") 1 else 2
#  
#  ## end command line parsing
#  ##
#  ########################################
#  
#  ########################################
#  ##
#  ## analyze duprates and create plots
#  ##
#  cat("Processing file ", bam, " with GTF ", gtf, "\n")
#  
#  ## calculate duplication rate matrix
#  dm <- analyzeDuprates(bam,
#                        gtf,
#                        stranded,
#                        (paired == "yes"),
#                        threads)
#  
#  ## produce plots
#  
#  ## duprate vs. expression smooth scatter
#  png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drescatter.png"),
#      width=1000, height=1000)
#  duprateExpDensPlot(dm, main=basename(bam))
#  dev.off()
#  
#  ## expression histogram
#  png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_ehist.png"),
#      width=1000, height=1000)
#  expressionHist(dm)
#  dev.off()
#  
#  ## duprate vs. expression boxplot
#  png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drebp.png"),
#      width=1000, height=1000)
#  par(mar=c(10,4,4,2)+.1)
#  duprateExpBoxplot(dm, main=basename(bam))
#  dev.off()

## ----citation-----------------------------------------------------------------
citation("dupRadar")

## ----echo=FALSE---------------------------------------------------------------
sessionInfo()

Try the dupRadar package in your browser

Any scripts or data that you put into this service are public.

dupRadar documentation built on Nov. 8, 2020, 5:41 p.m.