R/samFilters.R

Defines functions pairedReads samReadFilter samReadCount samReadMatch pairIsAligned pairIsPrimary pairHasRef pairIsChimeric pairIsNotChimeric areReadFlagsSet areReadFlagsUnset areReadFlags readHasRef readMateHasRef

Documented in areReadFlags areReadFlagsSet areReadFlagsUnset pairedReads pairHasRef pairIsAligned pairIsChimeric pairIsNotChimeric pairIsPrimary readHasRef readMateHasRef samReadCount samReadFilter samReadMatch

#' Filter paired or unpaired reads
#'
#' Drop all paired or unpaired reads. Relies on the accuracy and disjoint nature
#' of the FIRST_OF_PAIR and SECOND_OF_PAIR flags, and that each paired-end read
#' has a unique id (qname). Maintains multiple alignments for any kept read
#' ends.
#'
#' This allows dropping unpaired reads after filtering that might only drop one
#' read of a pair, such as filtering by quality.
#'
#' @param sam The Sam object to filter.
#'
#' @param paired Set \code{FALSE} to keep only unpaired reads. Keeps
#' only paired reads by default.
#'
#' @return A sam object with only paired reads, or with only unpaired reads.
#'
#' @examples
#' \dontrun{
#' sam <- Sam("someFile")
#' pairedOnly <- pairedReads(sam)
#' unpairedOnly <- pairedReads(sam, FALSE)
#' }
#'
#' @export
pairedReads <- function(sam, paired= TRUE) {
	# as.logical(bitwAnd(sam$reads$flag, 64)) is true for all first reads in pair
	#  (for each alignment if more than one). 128 is true for second.
	if (paired) {
		sam$reads <- sam$reads[
			sam$reads$qname %in% sam$reads$qname[as.logical(bitwAnd(sam$reads$flag, 64))] &
			sam$reads$qname %in% sam$reads$qname[as.logical(bitwAnd(sam$reads$flag, 128))], ]
	}
	else {
		sam$reads <- sam$reads[
			! sam$reads$qname %in% sam$reads$qname[as.logical(bitwAnd(sam$reads$flag,  64))] |
			! sam$reads$qname %in% sam$reads$qname[as.logical(bitwAnd(sam$reads$flag, 128))], ]
	}
	sam
}

#' Filter a Sam object
#'
#' Apply a filter to a sam object, returing a new sam object with the sam header
#' but only containing the reads for which the filter \code{FUNC} returns
#' \code{TRUE}. Uses \code{\link[plyr]{adply}} to apply this filter to each read
#' alignment. Many of the parameters are just pass-through parameters for
#' \code{\link[plyr]{adply}}. These all begin with a "\code{.}" to distinguish
#' them from potential parameters to the function as they come after the
#' \code{...} in the function definition. No attempt is made to keep or delete
#' both ends of a paired end read if one end passes and one end fails the
#' filter.
#'
#' NOTE: Does not add a @PG header line nor modify any PG:Z tag on any read.
#'
#' @param sam A Sam object
#' @param FUNC A function whose first parameter is a single read and that
#'   returns a single \code{TRUE} if that read should be kept when filtering the
#'   \code{sam} object, or \code{FALSE} if not. Should never return \code{NULL}
#'   or \code{NA}.
#' @param ... Other parameters to pass to \code{FUNC}.
#' @param .progress Progress bar; see \code{\link[plyr]{adply}}.
#' @param .inform Extra error handling; see \code{\link[plyr]{adply}}.
#' @param .parallel Use the parallel package; see \code{\link[plyr]{adply}}.
#' @param .paropts Options if using the parallel package; see \code{\link[plyr]{adply}}.
#' @param .id Id column; see \code{\link[plyr]{adply}}.
#'
#' @return A sam object with only the reads that passed the filter \code{FUNC}.
#'
#' @seealso Sam, readIsChimeric,
#'
#' @examples
#' \dontrun{
#' sam <- Sam("someFile")
#' hivIntegrationSam <- samReadFilter( sam, "readIsOnRef", "^chrMT$" )
#' }
#'
#' @export
samReadFilter <- function( sam, FUNC, ...,
	.progress= "none", .inform= FALSE, .parallel= FALSE, .paropts= NULL, .id= NA
) {
	sam$reads <- sam$reads[ samReadMatch( sam, FUNC, ...,
			.progress= .progress, .inform= .inform, .parallel= .parallel,
			.paropts= .paropts, .id= .id ), ]
	sam
}

# Old implementation, save until new implementation tested, then delete!
######
# samReadFilter <- function( sam, FUNC, ...,
# 	.progress= "none", .inform= FALSE, .parallel= FALSE, .paropts= NULL, .id= NA
# ) {
# 	sam$reads <- sam$reads[plyr::aaply(
# 		.data= sam$reads, .margins= 1, .fun= FUNC, ..., .expand= FALSE,
# 		.progress= .progress, .inform= .inform, .parallel= .parallel,
# 		.paropts= .paropts
# 	), ]
# 	sam
# }
######

#' Count reads meeting filter criteria
#'
#' Apply a filter to a sam object and count the reads for which the filter
#' \code{FUNC} returns \code{TRUE}. Uses \code{\link[plyr]{adply}} to apply this
#' filter to each read alignment. Many of the parameters are just pass-through
#' parameters for \code{\link[plyr]{adply}}. These all begin with a "\code{.}"
#' to distinguish them from potential parameters to the function as they come
#' after the \code{...} in the function definition. No attempt is made to keep
#' or delete both ends of a paired end read if one end passes and one end fails
#' the filter.
#'
#' @param sam A Sam object
#' @param FUNC A function whose first parameter is a single read and that
#'   returns a single \code{TRUE} if that read should be kept when filtering the
#'   \code{sam} object, or \code{FALSE} if not. Should never return \code{NULL}
#'   or \code{NA}.
#' @param ... Other parameters to pass to \code{FUNC}.
#' @param .progress Progress bar; see \code{\link[plyr]{adply}}.
#' @param .inform Extra error handling; see \code{\link[plyr]{adply}}.
#' @param .parallel Use the parallel package; see \code{\link[plyr]{adply}}.
#' @param .paropts Options if using the parallel package; see \code{\link[plyr]{adply}}.
#' @param .id Id column; see \code{\link[plyr]{adply}}.
#'
#' @return The count of matching reads
#'
#' @seealso samReadFilter
#'
#' @examples
#' \dontrun{
#' sam <- Sam("someFile")
#' hivIntegrationCount <- samReadCount( sam, "readIsOnRef", "^chrMT$" )
#' }
#'
#' @export
samReadCount <- function( sam, FUNC, ...,
	.progress= "none", .inform= FALSE, .parallel= FALSE, .paropts= NULL, .id= NA
) {
	sum( samReadMatch( sam, FUNC, ...,
	    .progress= .progress, .inform= .inform, .parallel= .parallel,
	    .paropts= .paropts, .id= .id ))
}

#' Identify reads matching an applied filter.
#'
#' Apply a filter to a sam object, returning a logical vector specifying which
#' reads match the filter, i.e. the result of the filter \code{FUNC} for each
#' read. Uses \code{\link[plyr]{adply}} to apply this filter to each read
#' alignment. Many of the parameters are just pass-through parameters for
#' \code{\link[plyr]{adply}}. These all begin with a "\code{.}" to distinguish
#' them from potential parameters to the function as they come after the
#' \code{...} in the function definition. No attempt is made to keep or delete
#' both ends of a paired end read if one end passes and one end fails the
#' filter.
#'
#' @param sam A Sam object
#' @param FUNC A function whose first parameter is a single read and that
#'   returns a single \code{TRUE} if that read should be kept when filtering the
#'   \code{sam} object, or \code{FALSE} if not. Should never return \code{NULL}
#'   or \code{NA}.
#' @param ... Other parameters to pass to \code{FUNC}.
#' @param .progress Progress bar; see \code{\link[plyr]{adply}}.
#' @param .inform Extra error handling; see \code{\link[plyr]{adply}}.
#' @param .parallel Use the parallel package; see \code{\link[plyr]{adply}}.
#' @param .paropts Options if using the parallel package; see \code{\link[plyr]{adply}}.
#' @param .id Id column; see \code{\link[plyr]{adply}}.
#'
#' @return A logical vector with \code{TRUE} for each read that passed the
#' filter \code{FUNC} and \code{FALSE} otherwise.
#'
#' @seealso Sam, readIsChimeric,
#'
#' @examples
#' \dontrun{
#' sam <- Sam("someFile")
#' hivIntegrationSelect <- samReadFilter( sam, "readIsOnRef", "^chrMT$" )
#' }
#'
#' @export
samReadMatch <- function( sam, FUNC, ...,
	.progress= "none", .inform= FALSE, .parallel= FALSE, .paropts= NULL, .id= NA
) {
	selectionVec <- plyr::aaply(
		.data= sam$reads, .margins= 1, .fun= FUNC, ..., .expand= FALSE,
		.progress= .progress, .inform= .inform, .parallel= .parallel,
		.paropts= .paropts
	)
	names(selectionVec) <- NULL
	selectionVec
}

#' Paired-end read tests
#'
#' Tests a read to see if its meets some criteria. Only
#' tests that keep both or delete both reads when applied to each read end
#' separately are paired-end read tests. These tests are suitable for use in a
#' \code{\link{samReadFilter}} and will not result in unpaired reads after
#' application.
#'
#' @param read The read to test.
#'
#' @param ref Regular expression matched against the reference that this end
#'   and/or the other end aligns to. Keeps a read if \code{ref} matches to
#'   either (\code{rname}) and/or (\code{rnext}).
#' @param altRef Regular expression that matches the chromosome/reference name of
#'   whichever end \code{ref1} does not match.
#'
#' @return Returns \code{TRUE} if a read passes the test, \code{FALSE}
#'   otherwise. Each read from the same template will return the same result.
#'
#' @examples
#' \dontrun{
#' sam <- Sam("someFile")
#' aRead <- SamReads(sam)[1]
#'
#' pairHasRef( aRead, "^chrX$|^chrY$" )
#' sexLinked <- samReadFilter( sam, "pairHasRef", "^chrX$|^chrY$" )
#'
#' pairIsChimeric(aRead, "^chr..?$", "hiv|HIV")
#' hivIntegrationSam <- samReadFilter( sam, "pairIsChimeric", "^chr..?$", "hiv|HIV" )
#'
#' pairIsNotChimeric(aRead, "hiv|HIV")
#' hivOnly <- samReadFilter( sam, "pairIsNotChimeric", "hiv|HIV" )
#'
#' }
#' @name pairedEndReadTests
NULL
#NULL

#' @describeIn pairedEndReadTests selects paired end alignments when both ends
#' are aligned. Relies on accuracy of READ_UNMAPPED and MATE_UNMAPPED flags and
#' does not consider any other indicators. This will select non-primary
#' alignments as well as primary alignments, if included in the sam file.
#'
#' @export
pairIsAligned <- function(read) {
	allUnsetSamFlags(read$flag, c("READ_UNMAPPED", "MATE_UNMAPPED"))
}

#' @describeIn pairedEndReadTests selects paired end reads when both ends
#' are aligned and are the primary alignment. Relies on accuracy of the
#' READ_UNMAPPED, MATE_UNMAPPED, and NOT_PRIMARY_ALIGNMENT flags.
#'
#' @export
pairIsPrimary <- function(read) {
	allUnsetSamFlags(read$flag, c("READ_UNMAPPED", "MATE_UNMAPPED", "NOT_PRIMARY_ALIGNMENT"))
}

#' @describeIn pairedEndReadTests Identifies paired end reads when one end is aligned
#'   to a selected reference or chromosome. Takes a regular expression and
#'   will return TRUE if either a reads \code{rnext} or \code{rname} (or both)
#'   matches it.
#'
#' @export
pairHasRef <- function(read, ref ) {
	return( grepl( ref, read$rname ) || grepl( ref, read$rnext ))
}

#' @describeIn pairedEndReadTests Identifies paired end reads when one end is
#'   aligned to one reference or chromosome and the other end to a different
#'   one. Takes one or two regular expressions, and will return TRUE only if one read
#'   end matches the first expression and the other read end does not, or if \code{altRef}
#'   is specified if the other end matches that.
#'
#' @export
pairIsChimeric <- function(read, ref, altRef=NULL ) {
	if (is.null(altRef)) {
		(grepl( ref, read$rname ) && ! grepl( ref, read$rnext )) ||
		(grepl( ref, read$rnext ) && ! grepl( ref, read$rname ))

	}
	else {
		(grepl( ref, read$rname ) && grepl( altRef, read$rnext )) ||
		(grepl( altRef, read$rname ) && grepl( ref, read$rnext ))
	}
}

#' @describeIn pairedEndReadTests Identifies paired end reads when both ends are
#'   aligned to the same reference or chromosome and the other end to a different
#'   one. Takes a regular expression, and will return TRUE only if both a reads
#'   \code{rnext} and \code{rname} match it.
#'
#' @export
pairIsNotChimeric <- function(read, ref) {
	return( grepl( ref, read$rname ) && grepl( ref, read$rnext ))
}

#' Read tests
#'
#' Tests a read to see if its meets some criteria. May return \code{TRUE} for
#' only one end of a paired end read. These tests are suitable for use in a
#' \code{\link{samReadFilter}} but can result in unpaired reads after
#' application; \code{pairedReads} should probably be run after running any
#' of these as filters.
#'
#' @param read The read to test.
#' @param ref Regular expression matched against the reference aligned to.
#' Keeps a read if \code{ref} matches to the reference this read aligns to
#' (\code{readHasRef()}, matching against \code{rname}) or the reference the
#' mate of this read aligns to (\code{readMateHasRef()}, matching against
#' \code{rnext}).
#' @param flags A character vector of one or more flag names to test against
#' the flags set or unset for a read.
#' @param flagVec A logical vector where the names are sam flags and the values
#' indicate if that flag shoud be set (\code{TRUE}) or unset (\code{FLAG}). If
#' the value of a flag does not matter, it should not be included. NA should not
#' be used.
#'
#' @return Returns \code{TRUE} if a read passes the test, \code{FALSE} otherwise.
#' Different ends of the same template will return the same result.
#'
#' @examples
#' \dontrun{
#' sam <- Sam("someFile")
#' aRead <- SamReads(sam)[1]
#'
#' areReadFlagsUnset( aRead, "NOT_PRIMARY_ALIGNMENT" )
#' primaryReads <- samReadFilter( sam, "areReadFlagsUnset", "NOT_PRIMARY_ALIGNMENT" )
#'
#' areReadFlagsSet( aRead, "NOT_PRIMARY_ALIGNMENT" )
#' secondaryReads <- samReadFilter( sam, "areReadFlagsSet", "NOT_PRIMARY_ALIGNMENT" )
#'
#' goodAlignmentFlagVec <- c(
#' 	"READ_PAIRED"           =  TRUE, "PROPER_PAIR"             =  TRUE,
#'    "READ_UNMAPPED"         = FALSE, "MATE_UNMAPPED"           = FALSE,
#'    "NOT_PRIMARY_ALIGNMENT" = FALSE, "READ_FAILS_VENDOR_QC"    = FALSE,
#'    "DUPLICATE_READ"        = FALSE, "SUPPLEMENTARY_ALIGNMENT" = FALSE
#' )
#' areReadFlags( aRead, goodAlignmentFlagVec )
#' goodReads <- samReadFilter( sam, "areReadFlags", goodAlignmentFlagVec )
#' }
#' @name readTests
NULL
#NULL

#' @describeIn readTests Returns \code{TRUE} if all the specified flags are set for
#' the read. Equivalent to \code{\link{allSetSamFlags}(read$flag, flags)}
#'
#' @export
areReadFlagsSet <- function(read, flags) {
	allSetSamFlags(read$flag, flags)
}

#' @describeIn readTests Returns \code{TRUE} if all the specified flags are set for
#' the read. Equivalent to \code{\link{allUnsetSamFlags}(read$flag, flags)}
#'
#' @export
areReadFlagsUnset <- function(read, flags) {
	allUnsetSamFlags(read$flag, flags)
}

#' @describeIn readTests Returns \code{TRUE} if all the flags that set in the
#'   flagVec are set for the read and all the flags that are unset in the
#'   flagVec are unset in the read. If all flags in the flagVec checked are set
#'   or all are unset, it is more efficient to use \code{areReadFlagsSet} or
#'   \code{areReadFlagsUnset}.
#'
#' @export
areReadFlags <- function( read, flagVec ) {
	matchSamFlags( read$flag, flagVec )
}

#' @describeIn readTests Returns \code{TRUE} if the read is aligned to a
#'   reference identified by the specified regular expressions (matched against
#'   the reads \code{rname}).
#'
#' @export
readHasRef <- function( read, ref ) {
	grepl( ref, read$rname )
}

#' @describeIn readTests Returns \code{TRUE} if the read's mate (other paired
#'   end) is aligned to a  reference identified by the specified regular
#'   expressions (matched against the reads \code{rnext}).
#'
#' @export
readMateHasRef <- function( read, ref ) {
	grepl( ref, read$rnext )
}
jefferys/SamSeq documentation built on Oct. 25, 2019, 8:11 a.m.