R/initialisers.R

Defines functions .checkReadsToTargets .checkForPaired separateChimeras alnsToCrisprSet readTargetBam rcAlns collapsePairs

Documented in collapsePairs rcAlns readTargetBam

#'@title Trims reads to a target region.
#'@description Trims aligned reads to one or several target regions,
#'optionally reverse complementing the alignments.
#'@param reads A GAlignments object, or a character vector of the filenames
#'@param target A GRanges object specifying the range to narrow alignments to
#'@author Helen Lindsay
#'@rdname readsToTarget
#'
#'@import methods
#'@import BiocParallel
#'@import Biostrings
#'@import ggplot2
#'@import GenomicAlignments
#'@import GenomicRanges
#'@import IRanges
#'@import Rsamtools
#'@importFrom grid gpar grid.rect
#'@importFrom gridExtra grid.arrange arrangeGrob
#'@importFrom S4Vectors elementNROWS remapHits na.omit queryHits subjectHits
#'@importFrom reshape2 melt
#'@importFrom AnnotationDbi select
#'@importFrom GenomeInfoDb seqlengths
#'@export
setGeneric("readsToTarget", function(reads, target, ...) {
  standardGeneric("readsToTarget")})


#'@param name An experiment name for the reads.  (Default: NULL)
#'@param reverse.complement Should the alignments be oriented to match
#'the strand of the target? (Default: TRUE)
#'@param collapse.pairs  If reads are paired, should pairs be collapsed?  (Default: FALSE)
#'Note: only collapses primary alignments, and assumes that there is only one primary
#'alignment per read.  May fail with blat alignments converted to bam.
#'@param use.consensus Take the consensus sequence for non-matching pairs? If FALSE,
#'the sequence of the first read is used.  Can be very slow. (Default: FALSE)
#'@param store.chimeras Should chimeric reads be stored?  (Default: FALSE)
#'@param verbose Print progress and statistics (Default: TRUE)
#'@return (signature("GAlignments", "GRanges")) A \code{\link{CrisprRun}} object
#'@examples
#'# Load the metadata table
#'md_fname <- system.file("extdata", "gol_F1_metadata_small.txt", package = "CrispRVariants")
#'md <- read.table(md_fname, sep = "\t", stringsAsFactors = FALSE)
#'
#'# Get bam filenames and their full paths
#'bam_fnames <- sapply(md$bam.filename, function(fn){
#'  system.file("extdata", fn, package = "CrispRVariants")})
#'
#'reference <- Biostrings::DNAString("GGTCTCTCGCAGGATGTTGCTGG")
#'gd <- GenomicRanges::GRanges("18", IRanges::IRanges(4647377, 4647399),
#'        strand = "+")
#'
#'crispr_set <- readsToTarget(bam_fnames, target = gd, reference = reference,
#'                            names = md$experiment.name, target.loc = 17)
#'
#'@rdname readsToTarget
setMethod("readsToTarget", signature("GAlignments", "GRanges"),
          function(reads, target, ..., reverse.complement = TRUE,
                   chimeras = NULL, collapse.pairs = FALSE, use.consensus = FALSE,
                   store.chimeras = FALSE, verbose = TRUE, name = NULL){

            if (length(target) > 1){
              stop("readsToTarget accepts a single target range")
            }

            dots <- list(...)
            keep.unpaired <- TRUE
            if ("keep.unpaired" %in% names(dots)){
              keep.unpaired <- dots["keep.unpaired"]
            }

            if (length(reads) == 0){
              if (length(chimeras) == 0) {return(NULL)}
              crun <- CrisprRun(reads, target, IRanges::IRangesList(), rc = rc,
                                name = name, chimeras = chimeras, verbose = verbose)
              return(crun)
            }

            if (isTRUE(collapse.pairs)){
              if (is.null(names(reads)) |  ! ("flag" %in% names(mcols(reads))) ){
                stop("Reads must include names and bam flags for collapsing pairs")
              }
            }

            if (is.null(chimeras)) {
              chimeras <- GenomicAlignments::GAlignments()
              if (isTRUE(store.chimeras)) {
                # There is no way to specify tolerance here, or to use target.loc
                temp <- separateChimeras(reads, target, by.flag = collapse.pairs,
                                         verbose = verbose)
                reads <- temp$bam
                chimeras <- temp$chimeras[[1]]
              }
            }

            # Filter out reads that don't span the target region
            # Not using findOverlaps because reads may be paired, i.e. names nonunique
            bam <- reads[start(reads) <= start(target) & end(reads) >= end(target) &
                         seqnames(reads) == as.character(seqnames(target))]

            if (isTRUE(verbose)){
              message(sprintf("%s of %s nonchimeric reads span the target range\n",
                          length(bam), length(reads)))
            }

            # If bam and or chimeras are empty, no further calculation needed
            if (length(bam) == 0 & length(chimeras) == 0) return(NULL)

            if (length(bam) == 0){
              crun <- CrisprRun(bam, target, IRanges::IRangesList(), rc = rc,
                          name = name, chimeras = chimeras, verbose = verbose)
              return(crun)
            }

            # If bam is non-empty, orient and narrow the reads to the target
            if (reverse.complement & as.character(strand(target)) == "*"){
             message(paste0("Target does not have a strand, but reverse.complement is TRUE. ",
                            "Orienting reads to reference strand."))
              rc = FALSE
            }else{
              rc <- rcAlns(as.character(strand(target)), reverse.complement)
            }

            # narrow aligned reads
            result <- narrowAlignments(bam, target, reverse.complement = rc,
                                       verbose = verbose)
            gen_ranges <- GenomicAlignments::cigarRangesAlongReferenceSpace(
                             cigar(result), pos = start(result))

            # Collapse pairs of narrowed reads

            if (isTRUE(collapse.pairs)){
              result <- collapsePairs(result, genome.ranges = gen_ranges,
                                      use.consensus = use.consensus,
                                      keep.unpaired = keep.unpaired, verbose = verbose)

              result <- result$alignments
            }

            if (length(result) == 0){
              if (length(chimeras) == 0){
                return(NULL)
              }
              crun <- CrisprRun(result, target, IRanges::IRangesList(), rc = rc,
                                name = name, chimeras = chimeras, verbose = verbose)
              return(crun)
            }

            crun <- CrisprRun(result, target, gen_ranges, rc = rc, name = name,
                              chimeras = chimeras, verbose = verbose)
            crun
          })



#'@rdname readsToTarget
setMethod("readsToTarget", signature("GAlignmentsList", "GRanges"),
          function(reads, target, ..., reference = reference,
                   names = NULL, reverse.complement = TRUE, target.loc = 17,
                   chimeras = NULL, collapse.pairs = FALSE, use.consensus = FALSE,
                   verbose = TRUE){

    # Deal with potentially empty chimeras
    # check that lengths of reads and chimeras are either 0 or the same
    nreads <- length(reads)
    nch <- length(chimeras)
    if (nreads > 0 & nch > 0 & ! nreads == nch){
      stop("Chimeras must be either NULL or a GAlignmentsList of length equal to reads")
    }

    cset <- alnsToCrisprSet(reads, reference, target, reverse.complement,
                            collapse.pairs, names, use.consensus, target.loc,
                            verbose, chimeras = chimeras,  ...)
    cset
})



#'@param names Experiment names for each bam file.  If not supplied, filenames are used.
#'@param chimeras Flag to determine how chimeric reads are treated.  One of
#'"ignore", "exclude", and "merge".  Default "count", "merge" not implemented yet
#'@param reference The reference sequence
#'@param exclude.ranges Ranges to exclude from consideration, e.g. homologous to a pcr primer.
#'@param exclude.names Alignment names to exclude
#'@param ... Extra arguments for initialising CrisprSet
#'@return (signature("character", "GRanges")) A \code{\link{CrisprSet}} object
#'@rdname readsToTarget
setMethod("readsToTarget", signature("character", "GRanges"),
          function(reads, target, ..., reference, reverse.complement = TRUE,
                   target.loc = 17, exclude.ranges = GRanges(), exclude.names = NA,
                   chimeras = c("count","exclude","ignore", "merge"),
                   collapse.pairs = FALSE, use.consensus = FALSE,
                   names = NULL, verbose = TRUE){

            # Make sure the reference sequence consists of one sequence
            # Coerce to "DNA string if necessary"
            if (class(reference) == "character" | class(reference) == "DNAStringSet"){
              if (length(reference) > 1){
                stop("Reference should be a single sequence, as a DNAString")
              }
              reference <- as(reference[[1]], "DNAString")
            }
            args <- list(...)
            c_to_t <- 5
            if ("chimera.to.target" %in% names(args)){
              c_to_t <- args[["chimera.to.target"]]
            }

            chimeras <- match.arg(chimeras)

            temp <- lapply(reads, readTargetBam, target = target,
                           exclude.ranges = exclude.ranges,
                           exclude.names = exclude.names,
                           chimeras = chimeras, by.flag = collapse.pairs,
                           chimera.to.target = c_to_t,
                           verbose = verbose)

            alns <- lapply(temp, "[[", "bam")
            chimeras <- lapply(temp, "[[", "chimeras")

            # If names are not specified, set them to the filenames
            if (is.null(names)){
              names <- reads
            }
            names <- as.character(names)

            cset <- alnsToCrisprSet(alns, reference, target, reverse.complement,
                                    collapse.pairs, names, use.consensus, target.loc,
                                    verbose = verbose, chimeras = chimeras, ...)

            cset
          })


#'@export
#'@rdname readsToTarget
setGeneric("readsToTargets", function(reads, targets, ...) {
  standardGeneric("readsToTargets")})

#'@param targets A set of targets to narrow reads to
#'@param references A set of reference sequences matching the targets.
#'References for negative strand targets should be on the negative strand.
#'@param primer.ranges A set of GRanges, corresponding to the targets.
#'Read lengths are typically greater than target regions, and it can
#'be that reads span multiple targets.  If primer.ranges are available,
#'they can be used to assign such reads to the correct target.
#'@param target.loc The zero point for renumbering (Default: 17)
#'@param ignore.strand Should strand be considered when finding overlaps?
#'(See \code{\link[GenomicAlignments]{findOverlaps}} )
#'@param bpparam A BiocParallel parameter for parallelising across reads.
#'Default: no parallelisation.  (See \code{\link[BiocParallel]{bpparam}})
#'@param chimera.to.target Number of bases that may separate a chimeric read
#'set from the target.loc for it to be assigned to the target. (Default: 5)
#'@rdname readsToTarget
setMethod("readsToTargets", signature("character", "GRanges"),
          function(reads, targets, ..., references, primer.ranges = NULL,
                   target.loc = 17, reverse.complement = TRUE, collapse.pairs = FALSE,
                   use.consensus = FALSE, ignore.strand = TRUE,
                   names = NULL, bpparam = BiocParallel::SerialParam(),
                   chimera.to.target = 5, verbose = TRUE){

            dummy <- .checkReadsToTargets(targets, primer.ranges, references)

            if (is.null(names)){
              names <- reads
            }

            param <- Rsamtools::ScanBamParam(what = c("seq", "flag"))
            args <- list(...)
            ntargets <- length(targets)

            bams <- BiocParallel::bplapply(seq_along(reads), function(i){
              if (verbose) message(sprintf("Loading alignments for %s\n\n",
                                       names[i]))
              bam <- GenomicAlignments::readGAlignments(reads[i],
                                              param = param, use.names = TRUE)
              if (length(bam) == 0){
                if (verbose) message("No reads in alignment\n")
                return(NULL)
              }
              return(bam)
           }, BPPARAM = bpparam)

           bams <- GAlignmentsList(bams)
           if (collapse.pairs == FALSE) dummy <- .checkForPaired(bams)

           result <- readsToTargets(bams, targets, references = references,
                       target.loc = target.loc, verbose = verbose,
                       reverse.complement = reverse.complement,
                       ignore.strand = ignore.strand,
                       collapse.pairs = collapse.pairs, names = names,
                       bpparam = bpparam, use.consensus = use.consensus,
                       chimera.to.target = chimera.to.target)
          result

          })


#'@rdname readsToTarget
setMethod("readsToTargets", signature("GAlignmentsList", "GRanges"),
          function(reads, targets, ..., references, primer.ranges = NULL,
                   target.loc = 17, reverse.complement = TRUE, collapse.pairs = FALSE,
                   use.consensus = FALSE, ignore.strand = TRUE,
                   names = NULL, bpparam = BiocParallel::SerialParam(),
                   chimera.to.target = 5, verbose = TRUE){

    dummy <- .checkReadsToTargets(targets, primer.ranges, references)
    if (collapse.pairs == FALSE) dummy <- .checkForPaired(reads)

    if (is.null(names)){
      if (! is.null(names(reads))){
        names <- names(reads)
      }else{
        names <- sprintf("Sample %s",seq_along(reads))
      }
    }

    byPCR <- BiocParallel::bplapply(reads, function(bam){

      if (verbose) message(sprintf("Assigning chimeric reads to targets \n"))

      # Change to default: just supply the cut site (target.loc)
      ch_tgts <- resize(resize(targets, target.loc + 1, fix="start"), 2, fix = "end")

      temp <- separateChimeras(bam, ch_tgts, chimera.to.target,
                              by.flag = collapse.pairs, verbose = verbose)
      bam <- temp$bam
      chimerasByPCR <- temp$chimeras

      # If primer.ranges are provided, match reads to primers
      # If not, match reads to targets
      if (! is.null(primer.ranges)){
        hits <- readsByPCRPrimer(bam, primer.ranges, verbose = verbose)
        splits <- split(queryHits(hits), subjectHits(hits))
      } else{
        hits <- findOverlaps(targets, bam, type = "within", ignore.strand = TRUE)
        duplicates <- (duplicated(subjectHits(hits)) |
                       duplicated(subjectHits(hits), fromLast = TRUE))
        if (verbose){
          msg <- paste0("%s (%.2f%%) reads of %s overlap a target\n",
                        "  %s (%.2f%%) of these overlapping multiple targets removed\n",
                        "  %s (%.2f%%) reads mapped to a single target\n\n")
          rhits <- length(unique(subjectHits(hits)))
          bl <- length(bam)
          ndups <- sum(duplicated(subjectHits(hits)))
          nndups <- sum(!duplicates)
          message(sprintf(msg, rhits, rhits/bl*100, bl, ndups, ndups/rhits*100,
                      nndups, nndups/bl*100))
        }
        hits <- hits[!duplicates]
        splits <- split(subjectHits(hits), queryHits(hits))
      }
      bamByPCR <- as.list(rep(NA, length(targets)))
      names(bamByPCR) <- seq_along(targets)
      for (nm in names(splits)){
        bamByPCR[nm] <- bam[splits[[nm]]]
      }
      byPCR <- list(bamByPCR = bamByPCR, chimerasByPCR = chimerasByPCR)
      byPCR
    }, BPPARAM = bpparam)

    # Remove any empty alignments
    to_keep <- which(lapply(byPCR, length) > 0)
    byPCR <- byPCR[to_keep]
    names <- names[to_keep]
    if (length(names) == 0){
      stop("No files contain on target reads")
    }

    # Reformat to list by guides instead of samples
    tlist <- function(i) {
      lapply(temp, "[[", i)
    }
    temp <- lapply(byPCR, "[[", "chimerasByPCR")
    chimerasByPCR <- lapply(seq_along(temp[[1]]), tlist)
    temp <- lapply(byPCR, "[[", "bamByPCR")
    bamByPCR <- lapply(seq_along(temp[[1]]), tlist)
    tg_gr <- GRangesList(as.list(targets))

    dummy <- gc()

    result <- BiocParallel::bplapply(seq_along(bamByPCR), function(i){
      bams <- bamByPCR[[i]]
      tgt <- tg_gr[[i]]
      chs <- chimerasByPCR[[i]]
      ref <- references[i]
      if (isTRUE(verbose)){
        message(sprintf("\n\nWorking on target %s\n", names(tgt)))
      }

      cset <- alnsToCrisprSet(bams, ref, tgt, reverse.complement,
                              collapse.pairs, names, use.consensus, target.loc,
                              verbose, chimeras = chs,  ...)
    }, BPPARAM = bpparam)

    if (length(result) == 0){
      warning("No reads span a target")
      return(result)
    }

    if (!is.null(names(targets))) {
      names(result) <- names(targets)
    }

    result <- result[!sapply(result, is.null)]
    result
  })


.checkReadsToTargets <- function(targets, primer.ranges, references){
  if (length(primer.ranges) > 0 & length(primer.ranges) != length(targets)){
    stop("primer.ranges should be the amplified regions, one per target")
  }
  if (length(targets) != length(references)){
    stop("A reference sequence for each target must be provided")
  }
  TRUE
}

.checkForPaired <- function(bams){
  is_first <- function(bams){ bitwAnd(mcols(bams)$flag, 64) }
  if ("flag" %in% names(mcols(bams))){
    if (any(is_first(bams))){
      warning("collapse.pairs set to FALSE but flags indicate paired reads",
              immediate. = TRUE)
      return(FALSE)
    }
  }
  TRUE
}

separateChimeras <- function(bam, targets, tolerance = 5,
                             by.flag = TRUE, verbose = FALSE){

  # The supplementary alignment flag must be set to distinguish paired from
  # chimeric reads. Tolerance is added on both sides
  # A better approach might be to explicitly consider where chimeras
  # join w.r.t read
  # Worth warning if there are chimeras independent of the guide?

  # Find chimeras
  ch_idxs <- findChimeras(bam, by.flag)
  chimeras <- bam[ch_idxs]
  original_ln <- length(chimeras)

  # Setup return data
  chimerasByPCR <- vector("list", length(targets))
  names(chimerasByPCR) <- as.character(seq_along(targets))

  # If the target is completely contained within one member of the
  # chimeric set (two members if paired), do not count it as a chimera
  guide_within <- subjectHits(findOverlaps(targets, chimeras,
                              ignore.strand = TRUE, type = "within"))
  ordered <- unique(guide_within[order(guide_within)])

  # If any targets contained within a chimeric read, remove all members of set
  if (length(ordered) > 0){
    lns_rle <- rle(names(chimeras)[ordered])$lengths
    grps <- rep(1:length(lns_rle), lns_rle)
    is_first <- paste(grps, bitwAnd(mcols(chimeras)$flag[ordered], 64),
                      sep = ".")
    not_dup <- !(duplicated(is_first) | duplicated(is_first, fromLast = TRUE))

    # Remove all chimeras with guides included from the chimeric sets
    non_ch <- names(chimeras) %in% names(chimeras)[ordered][not_dup]
    ch_idxs <- ch_idxs[!non_ch]
    chimeras <- bam[ch_idxs]
  }

  # Assign chimeras to targets
  tgt_plus_tol <- targets + tolerance
  hits <- findOverlaps(chimeras, tgt_plus_tol, ignore.strand = TRUE)

  # Exclude members that match multiple targets
  is_dup <- duplicated(queryHits(hits)) | duplicated(queryHits(hits), fromLast = TRUE)
  hits <- hits[!is_dup]
  splits <- split(queryHits(hits), subjectHits(hits))

  #For each hit, collect all alignments with the same name
  idx_by_primer <- lapply(splits, function(idxs){
    ch_idxs[names(chimeras) %in% names(chimeras)[idxs]]
  })

  ibp <- unlist(idx_by_primer)

  alnsByPCR <- lapply(idx_by_primer, function(ids){ bam[ids]})
  chimerasByPCR[names(splits)] <- alnsByPCR

  if (isTRUE(verbose)){
    # How many chimeric sets were not assigned?
    n_inc <- length(unique(ibp))
    n_total <- length(ch_idxs)
    n_dup <- sum(duplicated(ibp) | duplicated(ibp, fromLast = TRUE))
    pct_inc <- n_inc/n_total * 100
    pct_multi <- n_dup/n_total * 100
    removed <- original_ln - length(chimeras)

    rm_pct <- removed/original_ln * 100
    message(sprintf(paste0("%s from %s (%.2f%%) chimeras did not involve guide\n",
                    "%s from %s (%.2f%%) remaining chimeric reads included\n",
                    "%s (%.2f%%) assigned to more than one target\n"),
                removed, original_ln, rm_pct, n_inc, n_total,
                pct_inc, n_dup, pct_multi))
  }

  # Remove chimeras from the bam
  if (length(ch_idxs) >= 2){
    bam <- bam[-ch_idxs]
  }
  # Return list of chimerasByPCR and bam
  result <- list(bam = bam, chimerasByPCR = chimerasByPCR)
  result
}



alnsToCrisprSet <- function(alns, reference, target, reverse.complement,
                            collapse.pairs, names, use.consensus, target.loc,
                            verbose, chimeras = NULL,
                            store.chimeras = FALSE, ...){

  crispr.runs <- lapply(seq_along(alns), function(i){
    aln <- alns[[i]]

    if (!class(aln) == "GAlignments") {
      aln <- GenomicAlignments::GAlignments()
    }

    chim <- chimeras[[i]]
    if (is.null(chim)) {chim <- GenomicAlignments::GAlignments()}
    crun <- readsToTarget(aln, target = target,
                reverse.complement = reverse.complement, chimeras = chim,
                collapse.pairs = collapse.pairs, use.consensus = use.consensus,
                verbose = verbose, name = names[i])
    crun
  })

  to_rm <- sapply(crispr.runs, is.null)
  if (any(to_rm)){
    if (verbose){
      rm_nms <- paste0(names[to_rm], collapse = ",", sep = "\n")
      message(sprintf("Excluding samples that have no on target reads:\n%s",
                  rm_nms))
    }
    crispr.runs <- crispr.runs[!to_rm]
    names <- names[!to_rm]
    if (length(crispr.runs) == 0) {
      warning("Could not narrow reads to target, no samples have on-target alignments")
      return()
    }
  }

  rc <- rcAlns(as.character(strand(target)),reverse.complement)

  cset <- CrisprSet(crispr.runs, reference, target, rc = rc, target.loc = target.loc,
                    verbose = verbose, names = names, ...)
  cset
}


#'@title Internal CrispRVariants function for reading and filtering a bam file
#'@description Includes options for excluding reads either by name or range.
#'The latter is useful if chimeras are excluded.  Reads are excluded before
#'chimeras are detected, thus a chimeric read consisting of two sections, one of
#'which overlaps an excluded region, will not be considered chimeric.
#'Chimeric reads can be ignored, excluded, which means that all sections of a
#'chimeric read will be removed, or merged, which means that chimeras will be
#'collapsed into a single read where possible. (Not implemented yet)
#'If chimeras = "merge", chimeric reads are merged if all segments
# are from the same chromosome, do not overlap, and are aligned to the same strand.
# It is assumed that sequences with two alignments are chimeras, not alternate mappings
#'@param file The name of a bam file to read in
#'@param target A GRanges object containing a single target range
#'@param exclude.ranges A GRanges object of regions that should not be counted,
#'e.g. primer or cloning vector sequences that have a match in the genome
#'@param exclude.names A vector of read names to exclude.
#'@param chimera.to.target Maximum distance between endpoints of chimeras and
#'target.loc for assigning chimeras to targets (default: 5)
#'@param chimeras Flag to determine how chimeric reads are treated.  One of
#'"ignore", "exclude", "count" and "merge".  Default "ignore".
#'@param verbose Print stats about number of alignments read and filtered.  (Default: TRUE)
#'@param by.flag Is the supplementary alignment flag set?  Used for identifying chimeric
#'alignments, function is much faster if TRUE.  Not all aligners set this flag.  If FALSE,
#'chimeric alignments are identified using read names (Default: TRUE)
#'@return A GenomicAlignments::GAlignment obj
readTargetBam <- function(file, target, exclude.ranges = GRanges(),
                          exclude.names = NA, chimera.to.target = 5,
                          chimeras = c("count", "ignore","exclude","merge"),
                          by.flag = TRUE, verbose = TRUE){

  ch.action <- match.arg(chimeras)
  if (ch.action == "ignore"){
    # If chimeras are not to be excluded or merged,
    # we only need to read in reads overlapping the target region
    param <- Rsamtools::ScanBamParam(what = c("seq", "flag"), which = target)
  } else {
    # In this case, must read in the entire bam to be sure of finding chimeric reads
    param <- Rsamtools::ScanBamParam(what = c("seq", "flag"))
  }
  bam <- GenomicAlignments::readGAlignments(file, param = param, use.names = TRUE)
  if (length(bam) == 0){
    return(list(bam = GenomicAlignments::GAlignments(),
                chimeras = GenomicAlignments::GAlignments()))
  }

  # Check that "seq" is not empty
  unq_wdths <- unique(width(mcols(bam)$seq))
  if (length(unq_wdths) == 0){
    if (unq_wdths == 0) stop("No sequence found in bam file")
  }

  #Exclude reads by name or range
  temp <- excludeFromBam(bam, exclude.ranges, exclude.names)

  if (isTRUE(verbose)){
    original <- length(bam)
    message(sprintf("Read %s alignments, excluded %s\n", original,
                    original - length(temp)))
  }
  bam <- temp

  if (length(bam) == 0 | ch.action == "ignore"){
    return(list(bam = bam, chimeras = GenomicAlignments::GAlignments()))
  }

  chimera_idxs <- findChimeras(bam, by.flag = by.flag)

  if (chimeras == "exclude"){
    if( length(chimera_idxs) >= 2){
      bam <- bam[-chimera_idxs]
    }
    if (isTRUE(verbose)){
      message(sprintf("%s reads after filtering chimeras\n", length(bam)))
    }
    return(list(bam = bam, chimeras = GenomicAlignments::GAlignments()))
  }
  if (chimeras == "count"){
    temp <- separateChimeras(bam, target, tolerance = chimera.to.target,
                             by.flag = by.flag, verbose = verbose)
    return(list(bam = temp$bam, chimeras = temp$chimeras[[1]]))
  }
  if (chimeras == "merge"){
    warning("Merging chimeras not implemented yet, ignoring chimeras")
    return(list(bam = bam, chimeras = GenomicAlignments::GAlignments()))
  }
}


#'@title Internal CrispRVariants function for deciding whether to reverse
#'complement aligned reads
#'@description Guides on the negative strand may be displayed with respect
#' to either strand.  This function checks whether a guide is on the negative
#' strand and should be reverse complemented.
#'@param target.strand Strand of the target region
#'@param reverse.complement Should the alignment be oriented to match the strand
#'@return A logical value indicating whether the narrowed alignment should be reverse complemented.
#'@author Helen Lindsay
rcAlns <- function(target.strand, reverse.complement){
  if (target.strand == "-" & isTRUE(reverse.complement)) return(TRUE)
  FALSE
}


#'@title Narrow a set of aligned reads to a target region
#'@description Aligned reads are narrowed to the target region.  In
#'the case of reads with deletions spanning the boundaries of the target,
#'reads are narrowed to the start of the deletion,
#'@param alns A GAlignments object including a metadata column "seq"
#'containing the sequence
#'@param target A GRanges object
#'@param reverse.complement Should the aligned reads be reverse complemented?
#'@param verbose (Default: FALSE)
#'@param ... additional arguments
#'@author Helen Lindsay
#'@rdname narrowAlignments
#'@export
setGeneric("narrowAlignments", function(alns, target, ...) {
  standardGeneric("narrowAlignments")})

#'@return The narrowed alignments (GAlignments),
#'@rdname narrowAlignments
#'@examples
#'bam_fname <- system.file("extdata", "gol_F1_clutch_2_embryo_4_s.bam",
#'                          package = "CrispRVariants")
#'bam <- GenomicAlignments::readGAlignments(bam_fname, use.names = TRUE)
#'target <- GenomicRanges::GRanges("18", IRanges::IRanges(4647377, 4647399),
#'           strand = "+")
#'narrowAlignments(bam, target, reverse.complement = FALSE)
setMethod("narrowAlignments", signature("GAlignments", "GRanges"),
          function(alns, target, ..., reverse.complement, verbose = FALSE){

    # Narrowing example:
    # 3-4-5-6-7-8-9-10 Read
    #     5-6-7-8      Target sequence
    # target_start 5 - (read_start 3 - 1) = index 3
    # target_end 8 - target_start 5 + cigstart 3 = index

    # Notes:
    # Cannot directly narrow the alignments as the seqs aren't narrowed
    # and want to keep indel operations bordering targe range

    # alns must span target
    alns <- alns[start(alns) <= start(target) & end(alns) >= end(target)]
    m_cols <- as.list(mcols(alns))
    if ("qual" %in% names(m_cols) & ! "seq" %in% names(m_cols)){
      stop("Metadata col 'seq' must be present if 'qual' is present")
    }

    if (isTRUE(verbose)) message("narrowing alignments\n")

    ref_ranges <- GenomicAlignments::cigarRangesAlongReferenceSpace(cigar(alns))
    genomic <- GenomicRanges::shift(ref_ranges, start(alns)-1)

    # is unlist/relist needed
    # Find the on target operations
    clipped <- unlist(GenomicAlignments::explodeCigarOps(cigar(alns)))
    clipped <- clipped %in% c("S","H")
    on_target <- unlist(start(genomic) <= end(target) & end(genomic) >= start(target))
    on_target <- on_target & ! clipped
    on_target <- relist(on_target, ref_ranges)

    # Make masks to pull out first and last on-target ranges
    first_on_tg <- ! duplicated(on_target) & on_target
    last_on_tg <- ! duplicated(on_target, fromLast = TRUE) & on_target

    # Find cases where the first or last on target operation is a match ("M")
    ops <- unlist(explodeCigarOps(cigar(alns)))
    first_op_m <- ops[unlist(first_on_tg)] == "M"
    last_op_m <- ops[unlist(last_on_tg)] == "M"

    # Narrow border "M" operations to match target
    # need to know where the target start (genomic) is wrt sequence (query) coordinates
    cig <- GenomicAlignments::cigar(alns)
    q_ranges <- GenomicAlignments::cigarRangesAlongQuerySpace(cig)
    sq_starts <- start(q_ranges)[first_on_tg]

    genomic_offset_fom <- start(target) - start(genomic)[first_on_tg][first_op_m]
    sq_starts[first_op_m] <- sq_starts[first_op_m] + genomic_offset_fom
    genomic_starts <- start(genomic)[first_on_tg]
    genomic_starts[first_op_m] <- start(target)

    sq_ends <- end(q_ranges)[last_on_tg]
    genomic_offset_lom <- end(target) - start(genomic)[last_on_tg][last_op_m]
    sq_ends[last_op_m] <- start(q_ranges)[last_on_tg][last_op_m] + genomic_offset_lom

    # Adjust reference ranges to match
    ref_ranges <- unlist(ref_ranges)
    is_first_m <- which(unlist(first_on_tg))[first_op_m]

    # Calculate new ends before changing starts
    is_last_m <- which(unlist(last_on_tg))[last_op_m]
    lom_ends <- as.integer(start(ref_ranges[is_last_m]) + genomic_offset_lom)

    # Now adjust starts and ends
    start(ref_ranges[is_first_m]) <- as.integer(start(ref_ranges[is_first_m]) +
                                                        genomic_offset_fom)
    end(ref_ranges[is_last_m]) <- lom_ends

    # Get the width of on target reference ranges, recreate cigars
    # The width is the reference range unless an insertion, in which case the query range
    ref_ranges <- relist(ref_ranges, genomic)
    ref_ranges <- ref_ranges[on_target]
    qwdths <- unlist(width(q_ranges)[on_target])
    wdths <- width(ref_ranges)
    wdthl <- unlist(wdths)
    opsl <- ops[unlist(on_target)]
    wdthl[opsl == "I"] <- qwdths[opsl == "I"]
    new_cigs <- sapply(relist(paste0(wdthl, opsl), wdths), paste0, collapse = "")

    ga_params <- list(seqnames = seqnames(alns), pos = as.integer(genomic_starts),
                      cigar = new_cigs, names = names(alns), strand = strand(alns),
                      seqlengths = GenomeInfoDb::seqlengths(alns))

    if ("seq" %in% names(m_cols)){
      m_cols$seq <- subseq(m_cols$seq, start = as.numeric(sq_starts),
                          end = as.numeric(sq_ends))
    }
    if ("qual" %in% names(m_cols)){
      m_cols$qual <- subseq(m_cols$qual, start = as.numeric(sq_starts),
                          end = as.numeric(sq_ends))
    }

    ga_params <- c(ga_params, m_cols)
    new_alns <- do.call(GAlignments, ga_params)

    new_alns
})

#'@title Internal CrispRVariants function for collapsing pairs with concordant indels
#'@description Given a set of alignments to a target region, finds read pairs.
#'Compares insertion/deletion locations within pairs using the cigar string.
#'Pairs with non-identical indels are excluded.  Pairs with identical indels are
#'collapsed to a single read, taking the consensus sequence of the pairs.
#'@param alns A GAlignments object.  We do not use GAlignmentPairs because amplicon-seq
#'can result in pairs in non-standard pairing orientation.
#'Must include BAM flag, must not include unmapped reads.
#'@param use.consensus Should the consensus sequence be used if pairs have a mismatch?
#'Setting this to be TRUE makes this function much slower (Default: TRUE)
#'@param keep.unpaired Should unpaired and chimeric reads be included?  (Default: TRUE)
#'@param verbose Report statistics on reads kept and excluded
#'@param ... Additional items with the same length as alns,
#'that should be filtered to match alns.
#'@return The alignments, with non-concordant pairs removed and concordant pairs
#'represented by a single read.
#'@author Helen Lindsay
collapsePairs <- function(alns, use.consensus = TRUE, keep.unpaired = TRUE,
                          verbose = TRUE, ...){
  dots <- list(...)

  if (! unique(sapply(dots, length)) == length(alns)){
    stop("Each ... argument supplied must have the same length as the alignments")
  }

  # 1 = 2^0 = paired flag
  # 2048 = 2^11 = supplementary alignment flag
  is_primary <- !(bitwAnd(mcols(alns)$flag, 2048) & bitwAnd(mcols(alns)$flag, 1))
  pairs <- findChimeras(alns[is_primary], by.flag = FALSE) # This just matches read names

  # If there are no pairs, no need to do anything further
  if (length(pairs) == 0){
    if (isTRUE(keep.unpaired)){
      return(c(list("alignments" = alns), dots))
    } else {
      return(NULL)
    }
  }
  # Pairs are primary alignments with the same name
  nms <- rle(names(alns)[is_primary][pairs])
  nms_codes <- rep(1:length(nms$lengths), nms$lengths)

  # If reads have the same insertions and deletions, they have identical cigar strings
  cig_runs <- rle(paste(cigar(alns)[is_primary][pairs], nms_codes, sep = "."))$lengths
  concordant <- rep(cig_runs, cig_runs) == rep(nms$lengths,nms$lengths)

  # Keep first alignment from all concordant pairs
  # Flag 64 = 2^6 = first alignment in pair
  is_pair <- which(is_primary)[pairs]
  is_first <- as.logical(bitwAnd(mcols(alns)$flag[is_pair], 64))
  keep <- is_pair[concordant & is_first]

  if (verbose){
    nunpaired <- length(alns) - length(is_pair)
    cc_true <- sum(concordant)/2
    cc_false <- sum(!concordant)/2
    stats <- paste0("\nCollapsing paired alignments:\n",
              "%s original alignments\n",
              "  %s are not part of a primary alignment pair\n",
              "     (singletons and chimeras)\n",
              "  %s reads are paired \n",
              "    %s pairs have the same insertions/deletions\n",
              "    %s pairs have different insertions/deletions\n",
              "Keeping the first member of %s concordant read pairs\n")
    message(sprintf(stats, length(alns), nunpaired, length(is_pair),
                cc_true, cc_false, cc_true))
  }
  if (keep.unpaired){
    # Keep non-pairs, including non-primary and singletons
    keep <- c(keep, setdiff(c(1:length(alns)),is_pair))
    if (verbose) message(sprintf("Keeping %s unpaired reads\n", nunpaired))
  }
  keep_alns <- alns[keep]

  if (use.consensus){
    seq_runs <- rle(paste0(nms_codes, mcols(alns[pairs])$seq))$lengths
    same_seq <- rep(seq_runs, seq_runs) == rep(nms$lengths, nms$lengths)

    ncc_seqs <- mcols(alns[pairs][concordant & !same_seq])$seq
    if (verbose){
      message(sprintf("Finding consensus for %s pairs with mismatches\n",
                  length(ncc_seqs)/2))
    }
    if (length(ncc_seqs) >= 2){
      consensus <- sapply(seq(1,length(ncc_seqs), by = 2), function(i){
      Biostrings::consensusString(ncc_seqs[i:(i+1)])
      })
      # Overwrite the sequence of the non-concordant pairs.
      # The concordant alignments are at the start of keep
      ncc_idxs <- cumsum(concordant & is_first)[concordant & is_first & !same_seq]
      mcols(keep_alns[ncc_idxs])$seq <- Biostrings::DNAStringSet(consensus)
    }
  }
  if (length(keep) == 0) return(NULL)
  filtered.dots <- lapply(dots, function(x) x[keep])

  result <- c(list("alignments" = keep_alns), filtered.dots)
  result
}
markrobinsonuzh/CrispRVariants documentation built on May 21, 2019, 12:23 p.m.