R/ShortRead-methods.R

Defines functions naPositionFilter chastityFilter

Documented in chastityFilter naPositionFilter

#' Methods extending the ShortRead package functionalities
#'
#' These are functions extending the ShortRead packages capabilities:
#'
#' \itemize{
#' \item\code{barcodePlot} Creates a plot showing the barcode
#' distribution of a multiplexed sequencing library.
#' \item\code{chastityFilter} Creates a \code{\linkS4class{SRFilter}} instance
#' that filters SolexaExport read according to the chastity filtering value.
#' \item\code{demultiplex} Split a single \code{\linkS4class{AlignedRead}}
#' object into a list of \code{\linkS4class{AlignedRead}} objects according to
#' the barcodes provided by the user. It supports multicore processing
#' but has a default serial behaviour.
#' \item\code{naPositionFilter} Creates a
#' \code{\linkS4class{SRFilter}} instance that filters SolexaExport read
#' having an NA position.  }
#'
#' When demultiplexing, the function if provided with just the
#' \code{\linkS4class{AlignedRead}} will try to find out how many barcodes
#' were used and what they are. This is unwise to do as many barcodes will get
#' wrongly sequenced and not always the most frequent ones are the one you
#' used! It's therefore strongly advised to specify the barcodes' sequences
#' that were used.
#'
#' @name ShortRead additional methods
#' @rdname ShortRead-methods
#' @aliases barcodePlot barcodePlot,AlignedRead-method
#' barcodePlot,DNAStringSet-method barcodePlot,ShortReadQ-method
#' chastityFilter chastityFilter,SRFilter-method naPositionFilter
#' naPositionFilter,SRFilter-method demultiplex demultiplex,AlignedRead-method
#' demultiplex,DNAStringSet-method demultiplex,ShortReadQ-method alignData
#' @docType methods
#' @usage demultiplex(obj,barcodes=c(),barcodes.qty=12,barcode.length=6,
#' edition.dist=2,type=c("independant","within"),index.only=FALSE,mc.cores=1L)
#' barcodePlot(obj,barcodes=c(),type=c("independant","within"),
#' barcode.length=6,show.barcode=20,...)
#' chastityFilter(.name="Illumina Chastity Filter")
#' naPositionFilter(.name="NA Position Filter")
#' @param .name An internal string describing the filter
#' @param obj An object derived from class \code{\linkS4class{AlignedRead}}
#' @param barcodes A character vector describing the multiplex (i.e. barcode)
#' sequences used in the experiment.
#' @param barcodes.qty An integer describing the number of barcodes
#' @param barcode.length An integer describing the barcode length in bp
#' @param edition.dist The maximal edition distance (i.e. the number of
#' changes to apply), to accept an incorrectly sequenced barcode.
#' @param index.only simply return the index and not the barcode themselves.
#' @param mc.cores A parameter ultimately passed to srdistance to enable
#' parallel processing on mc.cores. On linux and Mac only, windows task remain
#' serially processed.
#' @param show.barcode An integer specifying how many barcodes should be
#' displayed in the final output.
#' @param type The type of barcode used. \code{independent} represents
#' barcodes generated by the illumina protocol; i.e. a separate additional
#' sequencing step performed once the first mate has been sequenced.
#' \code{within} represents barcodes that are part of the sequenced reads as
#' established by Lefrancois P et al., BMC Genomics, 2009
#' @param ... additional graphic parameters
#' @return \itemize{ \item\code{barcodePlot} returns invisibly the barcode
#' frequencies.  \item\code{chastityFilter} returns a
#' \code{\linkS4class{SRFilter}} instance.  \item\code{demultiplex} returns a
#' list of \code{\linkS4class{AlignedRead}} objects.
#' \item\code{naPositionFilter} returns a \code{\linkS4class{SRFilter}}
#' instance.  }
#' @author Nicolas Delhomme
#' @seealso \code{\linkS4class{SRFilter}} \code{\linkS4class{AlignedRead}}
#' @keywords methods
#' @examples
#'
#' 	\dontrun{
#' 	# the barcode
#' 	barcodes=c("ACACTG","ACTAGC","ATGGCT","TTGCGA")
#'
#' invisible(download.file(paste0("https://github.com/UPSCb/UPSCb/raw/",
#'        "master/tutorial/easyRNASeq/multiplex_export.txt.gz"),
#'        "multiplex_export.txt.gz"))
#'
#' 	# the multiplexed data
#' 	alns <- readAligned(".",
#'                     pattern="multiplex_export",
#'                     filter=compose(
#'                       chastityFilter(),
#'                       nFilter(2),
#'                       chromosomeFilter(regex="chr")),
#'                     type="SolexaExport",
#'                     withAll=TRUE)
#'
#' 	# barcode plot
#' 	barcodePlot(alns,
#'             barcodes=barcodes,
#'             type="within",
#'             barcode.length=6,
#'             show.barcode=20,
#'             main="All samples",
#'             xlim=c(0,0.5))
#'
#' 	# demultiplexing
#' 	dem.alns <- demultiplex(alns,
#'                         barcodes=barcodes,
#'                         edition.dist=2,
#'                         barcodes.qty=4,
#'                         type="within")
#'
#' 	# plotting again
#' 	par(mfrow=c(2,2))
#' 	barcode.frequencies <- lapply(
#'                               names(dem.alns$barcodes),
#'                               function(barcode,alns){
#'                                 barcodePlot(
#'                                             alns$barcodes[[barcode]],
#'                                             barcodes=barcode,
#'                                             type="within",barcode.length=6,
#'                                             show.barcode=20,
#'                                             main=paste(
#'                                               "Expected barcode:",
#'                                               barcode))
#'                               },dem.alns)
#' 	}
#'
# de-multiplex multiplexed libs
setMethod(
          f="demultiplex",
          signature="AlignedRead",
          definition=function(obj,barcodes=c(),
                              barcodes.qty=12,
                              barcode.length=6,
                              edition.dist=2,
                              type=c("independant","within"),
                              index.only=FALSE,
                              mc.cores=1L){

            # TODO we only want one type!!
            # default to independant

            # check the input
            types <- eval(formals("demultiplex")$type)
            if(!type %in% types){
              stop(paste(
                         "The given type:",
                         type,
                         "is not part of the supported types:",
                         paste(types,collapse=", ")))
            }

            # do we have barcodes
            if(length(barcodes)==0){
              barcodes <- switch(type,
                independant=sort(table(alignData(obj)$multiplexIndex),
                                 decreasing=TRUE)[1:barcodes.qty],
                within=sort(table(as.character(narrow(sread(obj),start=1,
                                                      width=barcode.length))),
                            decreasing=TRUE)[1:barcodes.qty])
            } else {
              if(length(unique(nchar(barcodes)))!=1){
                stop("We accept only barcodes having the same length.")
              }
              if(!all(nchar(barcodes) == barcode.length)){
                warning(paste("The provided barcode length was not correct.",
                              "Set it to:",nchar(barcodes[1])))
                barcode.length=nchar(barcodes[1])
              }
            }

            # get the barcodes, according to a certain size
            all.barcodes <- switch(type,
                               independant=DNAStringSet(
                                 as.character(alignData(obj)$multiplexIndex)),
                               within=narrow(sread(obj),start=1,
                                             width=barcode.length)
                               )

            # just for illumina
            if(type=="independant" & nchar(all.barcodes)[1] > barcode.length){
              all.barcodes <- narrow(all.barcodes,start=1,width=barcode.length)
            }

            # calculate the edition distance
            # dist<-do.call(cbind,srdistance(barcodes,barcodes))
            # or this to speed up
            # calc the dist only on the unique barcodes
            # rather than calling unique, generate all possibilities instead
            # and map it back
            unique.barcodes <- DNAStringSet(
              apply(
                as.matrix(eval(parse(
                  text=paste(
                    "expand.grid(",
                    paste(rep("c('A','C','G','T','N')",barcode.length),collapse=","),
                    ",stringsAsFactors=FALSE)")
                ))),1,paste,collapse=""))

            dist <- do.call(cbind,
                            srdistance(unique.barcodes,
                                       barcodes,
                                       BPPARAM=.getBpParam(mc.cores)))[match(all.barcodes,
                                                                             unique.barcodes),]

            # create a selector per barcode
            sels <- lapply(barcodes,
                           function(barcode,dist,edition.dist){
                             i.sel <- colnames(dist)==barcode
                             dist[,i.sel]<=edition.dist & eval(parse(text=paste(paste("dist[,",which(!i.sel),"] > edition.dist",collapse=" & "))))
                           },dist,edition.dist)

            # validate
            if(sum(sapply(sels,sum))>length(all.barcodes)){
              warning(
                      paste(
                            "You have more reads selected: ",sapply(sels,sum),"using the edition distance:",
                            edition.dist,"than you have reads:",length(all.barcodes),"!\nUse the barcodePlot function to visually assess it."))
            }
            # select barcodes
            names(sels)<-colnames(dist)

            # either the alns or the sels are returned
            if(index.only){
              return(sels)
            } else {
              # original read length
              read.length <- width(obj)[1]

              # return a lists of aln
              alns <- lapply(barcodes,function(barcode,obj,sels){
                reads <- narrow(obj[sels[[barcode]]],start=barcode.length+1,width=read.length-barcode.length)
                bars <- narrow(obj[sels[[barcode]]],start=1,width=barcode.length)
                return(list(reads=reads,bars=bars))
              },obj,sels)
              names(alns) <- colnames(dist)
              alns <- list(reads=lapply(alns,function(x){x[[1]]}),barcodes=lapply(alns,function(x){sread(x[[2]])}))
              return(alns)
            }
          })

setMethod(
          f="demultiplex",
          signature="DNAStringSet",
          definition=function(obj,barcodes=c(),
                              barcodes.qty=12,
                              barcode.length=6,
                              edition.dist=2,
                              type=c("independant","within"),
                              index.only=FALSE,
                              mc.cores=1L){

            # There's only one possible type!!
            # TODO extract this error to share it with other methods that go for a DNAStringSet
            if(type=="independant"){
              stop(paste("We cannot accept the independant argument for a ",class(obj),", since we miss the barcode sequences",sep=""))
            }

            # check the input
            types <- eval(formals("demultiplex")$type)[-1]
            if(!type %in% types){
              stop(paste(
                         "The given type:",
                         type,
                         "is not part of the supported types:",
                         paste(types,collapse=", ")))
            }

            # do we have barcodes
            if(length(barcodes)==0){
              barcodes <- sort(table(as.character(narrow(obj,start=1,width=barcode.length))),decreasing=TRUE)[1:barcodes.qty]
            }  else {
              if(length(unique(nchar(barcodes)))!=1){
                stop("We accept only barcodes having the same length.")
              }
              if(!all(nchar(barcodes) == barcode.length)){
                warning(paste("The provided barcode length was not correct. Set it to:",nchar(barcodes[1])))
                barcode.length=nchar(barcodes[1])
              }
            }

            # get the barcodes, according to a certain size
            all.barcodes <-narrow(obj,start=1,width=barcode.length)

            # calculate the edition distance
            # dist<-do.call(cbind,srdistance(barcodes,barcodes))
            # or this to speed up
            # calc the dist only on the unique barcodes
            # rather than calling unique, generate all possibilities instead
            # and map it back
            unique.barcodes <- DNAStringSet(
                                            apply(
                                                  as.matrix(eval(parse(
                                                                       text=paste(
                                                                         "expand.grid(",
                                                                         paste(rep("c('A','C','G','T','N')",barcode.length),collapse=","),
                                                                         ",stringsAsFactors=FALSE)")
                                                                       ))),1,paste,collapse=""))

            dist <- do.call(cbind,
                            srdistance(
                              unique.barcodes,
                              barcodes,
                              BPPARAM=.getBpParam(mc.cores)))[match(all.barcodes,unique.barcodes),]

            # create a selector per barcode
            sels <- lapply(barcodes,
                           function(barcode,dist,edition.dist){
                             i.sel <- colnames(dist)==barcode
                             dist[,i.sel]<=edition.dist & eval(parse(text=paste(paste("dist[,",which(!i.sel),"] > edition.dist",collapse=" & "))))
                           },dist,edition.dist)

            # validate
            if(sum(sapply(sels,sum))>length(all.barcodes)){
              warning(
                      paste(
                            "You have more reads selected: ",sapply(sels,sum),"using the edition distance:",
                            edition.dist,"than you have reads:",length(all.barcodes),"!\nUse the barcodePlot function to visually assess it."))
            }
            # select barcodes
            names(sels)<-colnames(dist)

            # either the alns or the sels are returned
            if(index.only){
              return(sels)
            } else {
              # original read length
              read.length <- width(obj)[1]

              # return a lists of aln
              alns <- lapply(barcodes,function(barcode,obj,sels){
                reads <- narrow(obj[sels[[barcode]]],start=barcode.length+1,width=read.length-barcode.length)
                bars <- narrow(obj[sels[[barcode]]],start=1,width=barcode.length)
                return(list(reads=reads,bars=bars))
              },obj,sels)
              names(alns) <- colnames(dist)
              alns <- list(reads=lapply(alns,function(x){x[[1]]}),barcodes=lapply(alns,function(x){x[[2]]}))
              return(alns)
            }
          })

setMethod(
          f="demultiplex",
          signature="ShortReadQ",
          definition=function(obj,barcodes=c(),
                              barcodes.qty=12,
                              barcode.length=6,
                              edition.dist=2,
                              type=c("independant","within"),
                              index.only=FALSE,
                              mc.cores=1L){
            return(
                   demultiplex(sread(obj),barcodes=barcodes,barcodes.qty=barcodes.qty,barcode.length=barcode.length, edition.dist=edition.dist, type=type,index.only=index.only)
                   )
          })

setMethod(
          f="barcodePlot",
          signature="AlignedRead",
          definition=function(obj,barcodes=c(),type=c("independant","within"),
                              barcode.length=6,show.barcode=20,...){

            # TODO check that we got barcodes

            # check the input
            types <- eval(formals("barcodePlot")$type)
            if(!type %in% types){
              stop(paste(
                         "The given type:",
                         type,
                         "is not part of the supported types:",
                         paste(types,collapse=", ")))
            }

            # get the barcodes
            barcodes <- switch(type,
                               independant=DNAStringSet(as.character(alignData(obj)$multiplexIndex)),
                               within=narrow(sread(obj),start=1,width=barcode.length)
                               )

            # get the frequency
            freqs <- sort(table(factor(as.character(barcodes)))/length(barcodes),decreasing=TRUE)

            # get the args
            args <- lapply(as.list(match.call())[-1], eval, parent.frame())
            args <- args[!names(args) %in% rev(names(formals("barcodePlot")))[-1]]

            # defaults
            x.lim=c(0,1)
            y.lim=c(0,show.barcode)
            p.main=""

            # upd defaults
            if(length(args)>=1){
              # check if xlim was provided
              if("xlim" %in% names(args)){
                x.lim=rev(1-args$xlim)
              }

              # check if ylim was provided
              if("ylim" %in% names(args)){
                y.lim=args$ylim
              }

              # check if main was provided
              if("main" %in% names(args)){
                p.main=args$main
              }
            }

            # get remaining args
            args <- args[!names(args)%in%c("xlim","ylim","main")]

            # plot
            if(length(args)>=1){
              eval(parse(text=paste("plot(0,0,bty='n',type='n',axes=FALSE,xlab='',ylab='',xlim=x.lim,ylim=y.lim,",paste(names(args),args,sep="=",collapse=","),")")))
            } else {
              plot(0,0,bty='n',type='n',axes=FALSE,xlab='',ylab='',xlim=x.lim,ylim=y.lim)
            }
            rect(c(1-freqs[1:show.barcode]),c((show.barcode-1):0),rep(1,show.barcode),c(show.barcode:1),col=ifelse(names(freqs)[1:show.barcode] %in% barcodes,"orange","black"))
            axis(2,labels=names(freqs)[1:show.barcode],at=seq(show.barcode-0.5,0.5,-1),las=2,tick=FALSE,line = -2)
            axis(3,labels=rev(seq(0,1,0.5)),at=seq(0,1,0.5))
            mtext("barcodes frequency",side=3,at=0.5,line=3)
            mtext(p.main,side=1,cex=par("cex.main"))

            # return
            invisible(freqs)
          })

# TODO merge that with the above one
# make sure that we have a proper cex for the y axis
setMethod(
          f="barcodePlot",
          signature="DNAStringSet",
          definition=function(obj,barcodes=c(),type=c("independant","within"),
                              barcode.length=6,show.barcode=20,...){

            # TODO check that we got barcodes

            # check the input
            types <- eval(formals("barcodePlot")$type)
            if(!type %in% types){
              stop(paste(
                         "The given type:",
                         type,
                         "is not part of the supported types:",
                         paste(types,collapse=", ")))
            }

            # get the barcodes
            barcodes <- narrow(obj,start=1,width=barcode.length)

            # get the frequency
            freqs <- sort(table(factor(as.character(barcodes)))/length(barcodes),decreasing=TRUE)

            # get the args
            args <- lapply(as.list(match.call())[-1], eval, parent.frame())
            args <- args[!names(args) %in% rev(names(formals("barcodePlot")))[-1]]

            # defaults
            x.lim=c(0,1)
            y.lim=c(0,show.barcode)
            p.main=""

            # upd defaults
            if(length(args)>=1){
              # check if xlim was provided
              if("xlim" %in% names(args)){
                x.lim=rev(1-args$xlim)
              }

              # check if ylim was provided
              if("ylim" %in% names(args)){
                y.lim=args$ylim
              }

              # check if main was provided
              if("main" %in% names(args)){
                p.main=args$main
              }
            }

            # get remaining args
            args <- args[!names(args)%in%c("xlim","ylim","main")]

            # plot
            if(length(args)>=1){
              eval(parse(text=paste("plot(0,0,bty='n',type='n',axes=FALSE,xlab='',ylab='',xlim=x.lim,ylim=y.lim,",paste(names(args),args,sep="=",collapse=","),")")))
            } else {
              plot(0,0,bty='n',type='n',axes=FALSE,xlab='',ylab='',xlim=x.lim,ylim=y.lim)
            }
            rect(c(1-freqs[1:show.barcode]),c((show.barcode-1):0),rep(1,show.barcode),c(show.barcode:1),col=ifelse(names(freqs)[1:show.barcode] %in% barcodes,"orange","black"))
            axis(2,labels=names(freqs)[1:show.barcode],at=seq(show.barcode-0.5,0.5,-1),las=2,tick=FALSE,line = -2,cex.lab=0.8)
            axis(3,labels=rev(seq(0,1,0.5)),at=seq(0,1,0.5))
            mtext("barcodes frequency",side=3,at=0.5,line=3)
            mtext(p.main,side=1,cex=par("cex.main"))

            # return
            invisible(freqs)
          })

# for ShortReadQ
# think of the same for othe ShortRead obj and maybe a more generic function...
setMethod(
          f="barcodePlot",
          signature="ShortReadQ",
          definition=function(obj,barcodes=c(),type=c("independant","within"),
                              barcode.length=6,show.barcode=20,...){
            return(
                   barcodePlot(sread(obj),barcodes=barcodes,type=type,
                               barcode.length=barcode.length,
                               show.barcode=show.barcode,...)
                   )
          })


# additional filters
#' @export chastityFilter
chastityFilter <- function(.name="Illumina Chastity Filter")
{
  srFilter(function(x){
    if(any(rownames(varMetadata(alignData(x))) == "filtering")){
      keep<-alignData(x)$filtering=="Y"
    } else {
      warning(paste("The '",.name,"' filter is only valid for Illumina reads.",sep=""))
      keep<-rep(TRUE,length(x))
    }
    return(keep)
  },name=.name)
}

#' @export naPositionFilter
naPositionFilter <- function(.name="NA Position Filter"){
  srFilter(function(x){
    !is.na(position(x))
  },name=.name)
}

# an internal function
".getBpParam" <- function(mc.cores=1L){
  return(switch(as.character(mc.cores),
                "1"=SerialParam(),
                MulticoreParam(workers=mc.cores)))
}

Try the easyRNASeq package in your browser

Any scripts or data that you put into this service are public.

easyRNASeq documentation built on April 30, 2020, 2 a.m.