trellis: Somatic structural variant analysis

Documented in addFocalDupsFlankingAmplicon AmpliconGraph ampliconParams focalAmpliconDupRanges groupAmplicons joinNearGRanges makeAGraph recurrentDrivers sv_amplicon_exp sv_amplicons sv_amplicons2

#' @include help.R
NULL

setClass("AnchoredReadPairs",
         representation(zero_end="GAlignmentPairs",
                        single_end="GAlignmentPairs",
                        both_ends="GAlignmentPairs"))

AnchoredReadPairs <- function(zero_end, single_end, both_ends=GRanges()){
  if(missing(zero_end)) new("AnchoredReadPairs")
  new("AnchoredReadPairs", zero_end=zero_end,
      single_end=single_end, both_ends=both_ends)
}

setMethod("zeroEndAnchors", "AnchoredReadPairs",
          function(object) object@zero_end)
setMethod("singleEndAnchors", "AnchoredReadPairs",
          function(object) object@single_end)
setMethod("bothEndAnchors", "AnchoredReadPairs",
          function(object) object@both_ends)


FilterEdgeParam <- function(freq=5, minimum_maxdist=50, bad_bins=GRanges()){
  list(freq=freq,
       minimum_maxdist=minimum_maxdist,
       bad_bins=bad_bins)
}

gaps0 <- function(x){
  g <- gaps(x)
  g <- g[strand(g)=="*"]
  g
}

hgnc <- function(g) g$hgnc

isDuplication <- function(object, minimum_foldchange=1){
  ## - must not already be called an amplicon
  ## - must be somatic
  ## - must have elevated copy number
  !isAmplicon(object) & (object$seg.mean >= minimum_foldchange) &
    !is.na(object$seg.mean) & isSomatic(object) & width(object) < 500e3
}


#' @aliases filterBy,GRanges,GRanges-method
#' @rdname filterBy-methods
setMethod("filterBy", c("GRanges", "GRanges"),
          function(query, subject, ...){
            dropbins <- unique(queryHits(findOverlaps(query, subject, type=type, ...)))
            if(length(dropbins) > 0)
              query <- query[-dropbins]
            return(query)
            ##query[!overlapsAny(query, subject, ...)]
          })

node2 <- function(name, sep="-") sapply(name, function(x) strsplit(x, sep)[[1]][2])
node1 <- function(name, sep="-") sapply(name, function(x) strsplit(x, sep)[[1]][1])

#' Lists germline filters and parameters for filtering amplicons
#'
#' 
#' @export
#'
#' @examples
#' params <- ampliconParams("hg19")
#'
#' @return Returns a list of germline filters and some of the hard
#'   thresholds used in the amplicon analysis.
#'
#' @param AMP_THR numeric threshold for high copy amplicon
#'
#' @param LOW_THR numeric a lower threshold used when considering
#'   amplicons that are bridged by improper read pairs to a high copy
#'   amplicon
#'
#' @param border_size used to construct a query (GRanges object) for
#'   additional amplicons neighboring a focal amplicon. TODO: more
#'   detail needed.
#'
#' @param overhang An integer indicating how much to expand the
#'   germline filter on each size.  Using \code{overlapsAny}, a
#'   determination is made whether an amplicon is part-germline (any
#'   overlap) or no-germline (no overlap).  If the amplicon is
#'   completely within the extended germline filter, the amplicon is
#'   considered fully-germline.
#'
#' @param MIN_WIDTH length-one integer vector indicating the minimum size of amplicon (see also \code{joinNearGRanges})
#'
#' @param MIN_SEGMEAN_DIFF length-one numeric vector.  Adjacent segments whose means differ by less than this value are candidates for merging by \code{{joinNearGRanges}}
#'
#' @param min.gapwidth length-one numeric vector passed to the \code{reduce} method that merges adacent segments with comparable segment means
#'
#' @param maxgap length-one numeric vector. Passed to \code{findOverlaps} when
#'   evaluating whether the amplicon ranges overlap with other amplicons (see
#'   \code{linkNearAmplicons}).
#'
#' @param frequency minimum number of read pairs 
#' 
#' @param minimum_maxdist TODO
#'
#' @param minimum_count a length-one integer vector: minimum number of
#'   improperly paired reads required to link two amplicons
#' 
#' @param bad_bins a \code{GRanges} object of problematic bins
ampliconParams <- function(AMP_THR=log2(2.75),
                           LOW_THR=log2(1.75),
                           border_size=10e3,
                           overhang=25e3,
                           MIN_WIDTH=2000,
                           MIN_SEGMEAN_DIFF=0.05,
                           min.gapwidth=3000,
                           maxgap=500e3,
                           frequency=5,
                           minimum_maxdist=50,
                           minimum_count=5,
                           bad_bins=GRanges()){
  ##filters <- listGenomeFilters()
  edge <- FilterEdgeParam(freq=5,
                          minimum_maxdist=minimum_maxdist,
                          bad_bins=bad_bins)
  filters <- list()
  filters$border_size <- border_size
  filters$overhang <- overhang
  filters$AMP_THR <- AMP_THR
  filters$LOW_THR <- LOW_THR
  filters$MIN_WIDTH <- MIN_WIDTH
  filters$MIN_SEGMEAN_DIFF <- MIN_SEGMEAN_DIFF
  filters$min.gapwidth <- min.gapwidth
  filters$maxgap <- maxgap
  filters$minimum_maxdist=minimum_maxdist
  filters$minimum_count=minimum_count
  filters$edge <- edge
  filters
}

overlapsGermline <- function(object, germline, overhang=5e3){
  filter_levels <- c("no_germline", "part_germline", "fully_germline")
  start(germline) <- pmax(start(germline)-overhang, 1)
  sl <- seqlengths(germline)[as.character(seqnames(germline))]
  end(germline) <- pmin(end(germline)+overhang, sl)
  germline <- reduce(germline)
  any_overlap <- overlapsAny(object, germline)
  filtered <- ifelse(any_overlap, "part_germline", "no_germline")
  filtered[any_overlap] <- "part_germline"
  complete_overlap <- findOverlaps(object, germline, type="within")
  filtered[unique(queryHits(complete_overlap))] <- "fully_germline"
  filtered <- factor(filtered, levels=filter_levels)
  filtered
}

standardizeGRangesMetadata <- function(granges){
  if(length(granges) == 0) return(granges)
  mns <- granges$seg.mean
  is_amplicon <- granges$is_amplicon
  granges <- reduce(granges)
  granges$seg.mean <- mns
  granges$is_amplicon <- is_amplicon
  names(granges) <- ampliconNames(granges)

  granges$hgnc <- as.character(NA)
  granges$driver <- as.character(NA)
  granges$biol_sign <- as.character(NA)
  granges$groups <- as.factor(NA)
  granges
}

isNotAmplicon <- function(object) !isAmplicon(object)
isGap <- function(object) width(object)==999 & is.na(object$seg.mean)

gapBetweenLowCopyRanges <- function(g){
  index_lowcopy <- which(isNotAmplicon(g))
  index_gap1kb <- which(isGap(g))
  flanked_by_lowcopy <- (index_gap1kb - 1) %in% index_lowcopy &
    (index_gap1kb + 1) %in% index_lowcopy
  gap1kb <- g[index_gap1kb]
  gap1kb[flanked_by_lowcopy]
}

startAmpliconBoundary <- function(ranges, is_gap=TRUE, extend=2e3){
  ### NOT RUN: is_gap is never specified when this function is called
  ###          so the value defaults to TRUE
  if(!is_gap){
    candidate_amplicon <- ranges$seg.mean > 0.8 & !is.na(ranges$seg.mean)
    if(any(!candidate_amplicon)){
      start(ranges)[!candidate_amplicon] <- end(ranges)[!candidate_amplicon]
    }
  }
  ###
  is_small <- width(ranges) < extend
  if(any(is_small)){
    small_amplicons <- ranges[is_small]
    if(all(is_small)) return(small_amplicons)
  }
  amplicons <- ranges[!is_small]
  cnv_begin_edge <- amplicons
  start(cnv_begin_edge) <- start(amplicons)
  end(cnv_begin_edge) <- start(amplicons)+extend
  if(any(is_small)){
    cnv_begin_edge <- sort(c(cnv_begin_edge, small_amplicons))
  }
  cnv_begin_edge
}

endAmpliconBoundary <- function(ranges, is_gap=FALSE, extend=2e3){
  ### NOT RUN: is_gap is never specified when this function is called
  ###          so the value defaults to TRUE
  if(!is_gap){
    candidate_amplicon <- ranges$seg.mean > 0.8 & !is.na(ranges$seg.mean)
    if(any(!candidate_amplicon)){
      start(ranges)[!candidate_amplicon]<- end(ranges)[!candidate_amplicon]
    }
  }
  ###
  is_small <- width(ranges) < extend
  if(any(is_small)){
    small_amplicons <- ranges[is_small]
    if(all(is_small)) return(small_amplicons)
  }
  amplicons <- ranges[!is_small]
  if(length(amplicons) == 0) return(amplicons)
  cnv_end_edge <- amplicons
  start(cnv_end_edge) <- end(amplicons)-extend
  end(cnv_end_edge) <- end(amplicons)
  if(any(is_small)){
    cnv_end_edge <- sort(c(cnv_end_edge, small_amplicons))
  }
  cnv_end_edge
}

borderSize <- function(object) object@border_size

germlineCNV <- function(object) object@germline_cnv

outliers <- function(object) object@outliers

ampliconQueryRanges <- function(object, min.fc=1){
  g <- ranges(object)
  gap1kb <- gapBetweenLowCopyRanges(g)
  starts <- startAmpliconBoundary(g, extend=borderSize(object))
  ends <- endAmpliconBoundary(g, extend=borderSize(object))
  g <- c(starts, ends)
  names(g) <- ampliconNames(g)
  g <- g[!duplicated(names(g))]
  g <- g[width(g) > 1]
  g <- g[!is.na(g$seg.mean)]
  g <- g[g$seg.mean > min.fc | isGap(g)]
  ## drop 999bp intervals between low copy ranges
  g <- g[!names(g) %in% names(gap1kb)]
  assembly_gaps <- assemblyGaps(object)
  if(length(assembly_gaps)==0) {
    queryRanges(object) <- g
    return(object)
  }
  ## drop ranges contained within assembly gaps
  assembly_gaps <- reduce(assembly_gaps)
  o <- findOverlaps(g, assembly_gaps, type="within")
  if(length(o) > 0){
    g <- g[-unique(queryHits(o))]
    if(length(g)==0) warning("All queryRanges are within assembly gaps")
  }
  ##  ## drop ranges contained within germline CNV / outliers gaps
  germline <- sort(reduce(c(germlineCNV(object), outliers(object))))
  o <- findOverlaps(g, germline, type="within")
  if(length(o) > 0){
    g <- g[-unique(queryHits(o))]
    if(length(g)==0) warning("All queryRanges are within germline")
  }
  queryRanges(object) <- g
  object
}

setMethod("combine", signature(x="GRanges", y="GRanges"),
          function(x, y,...){
            col.names <- intersect(colnames(mcols(x)), colnames(mcols(y)))
            ##col.names <- union(colnames(mcols(x)), colnames(mcols(y)))
            mcols(x) <- mcols(x)[, col.names]
            mcols(y) <- mcols(y)[, col.names]
            colnames(mcols(x)) <- colnames(mcols(y)) <- col.names
            xy <- c(x, y)
            xy
          })

.add_mcols <- function(g){
  g$seg.mean <- NA
  g$is_amplicon <- FALSE
  g$hgnc <- as.character(NA)
  g$driver <- as.character(NA)
  g$biol_sign <- as.character(NA)
  g$groups <- as.factor(NA)
  g
}

.empty_filters <- function(assembly_gaps=GRanges(),
                           centromeres=GRanges(),
                           germline_cnv=GRanges(),
                           outliers=GRanges()){
  list(assembly_gaps=assembly_gaps,
       centromeres=centromeres,
       germline_cnv=germline_cnv,
       outliers=outliers)
}

#' Constructor for AmpliconGraph
#'
#' Constructor for \code{AmpliconGraph}.
#'
#' @param ranges a \code{GRanges} object of the segmented normalized copy number 
#' @param filters a list of germline CNV and sequence-based filters
#' @param params a list of parameters
#' @rdname AmpliconGraph-constructor
#' @export
AmpliconGraph <- function(ranges=GRanges(),
                          filters,
                          params=ampliconParams(border_size=150e3, overhang=5e3)){
  if(missing(filters)){
    filters <- .empty_filters()
  }
  assembly_gaps <- filters[["assembly_gaps"]]
  centromeres <- filters[["centromeres"]]
  germline_cnv <- filters[["germline_cnv"]]
  outliers <- filters[["outliers"]]
  border_size <- params[["border_size"]]
  overhang <- params[["overhang"]]
  if(missing(ranges)) {
    ag <- new("AmpliconGraph")
    names(ranges(ag)) <- character()
    return(ag)
  }
  ranges <- standardizeGRangesMetadata(ranges)
  assembly_gaps <- reduce(sort(combine(assembly_gaps, centromeres)))
  ##
  ## Annotate GRanges
  ##
  ##  gaps
  ##
  g <- gaps0(ranges)
  if(length(g) > 0){
    g <- .add_mcols(g)
    ranges <- sort(c(ranges, g))
  }
  ##
  germ <- reduce(c(germline_cnv, outliers))
  ranges$overlaps_germline <- overlapsGermline(ranges, germ,
                                               overhang=overhang)
  names(ranges) <- ampliconNames(ranges)
  amps <- ranges[isAmplicon(ranges) & isSomatic(ranges)]
  tmp <- new("AmpliconGraph",
             graph=graphNEL(nodes=names(amps)),
             ranges=ranges,
             border_size=border_size,
             assembly_gaps=assembly_gaps,
             germline_cnv=germline_cnv,
             outliers=outliers)
  ## initialize query ranges
  ampliconQueryRanges(tmp, min.fc = params[["LOW_THR"]])
}

trimRangesOverlappingCentromere <- function(object, centromeres){
  if(numEdges(graph(object)) > 0) {
    stop("currently this method does not preserve edges")
  }
  rg <- ranges(object)
  sl <- seqlevels(rg)
  si <- seqinfo(rg)
  spanned_by_gap <- unique(queryHits(findOverlaps(rg, centromeres, type="within")))
  if(length(spanned_by_gap) > 0){
    rg <- rg[-spanned_by_gap]
  }
  overlaps_centromere <- overlapsAny(rg, centromeres)
  if(any(overlaps_centromere)){
    rgs <- rg[overlaps_centromere]
    trimmed <- setdiff(rgs, centromeres)
    j <- subjectHits(findOverlaps(trimmed, rgs))
    ## only keep one side
    tmp <- sapply(split(trimmed, j), function(g) g[which.max(width(g))])
    trimmed <- unlist(GRangesList(tmp))
    j <- subjectHits(findOverlaps(trimmed, rgs))
    trimmed$seg.mean <- rgs$seg.mean[j]
    trimmed$is_amplicon <- rgs$is_amplicon[j]
    trimmed$hgnc <- rgs$hgnc
    trimmed$driver <- rgs$driver
    trimmed$biol_sign <- rgs$biol_sign
    trimmed$groups <- rgs$groups
    trimmed$overlaps_germline <- rgs$overlaps_germline
    ## drop regions that were gaps to begin with
    trimmed <- trimmed[!is.na(trimmed$seg.mean)]
    rg <- filterBy(rg, rgs)
    rg <- sort(c(rg, trimmed))
    names(rg) <- ampliconNames(rg)
    rg <- keepSeqlevels(rg, sl, pruning.mode="coarse")
    seqinfo(rg) <- si
    ranges(object) <- rg
  }
  ag <- graphNEL(nodes=names(ampliconRanges(object)))
  graph(object) <- ag
  object
}

#' Merge adjacent amplicons that have a similar segment mean
#'
#' Merges adjacent amplicons (<= 3kb separation) that have similar
#' segment means.
#'
#' @keywords internal
#' @export
#' @return a \code{GRanges} object of the merged segments
#'
#' @param object a \code{GRanges} object
#'
#' @param thr a length-one numeric vector. If the difference of the average log
#'   ratio for two segments is less than this number, combine the segments. The
#'   segment mean of the merged GRanges will be the weighted average, where the
#'   weights are the segment widths.
#' 
#'
#' @param MIN_WIDTH minimum size of amplicon (default 2000)
#'
#' @details Merging is performed by the \code{reduce} operation.
#' 
#'
joinNearGRanges <- function(object, params){
  ##thr=0.05, MIN_WIDTH=2e3, min.gapwidth=3000){
  thr <- params[["MIN_SEGMEAN_DIFF"]]
  MIN_WIDTH <- params[["MIN_WIDTH"]]
  min.gapwidth <- params[["min.gapwidth"]]
  k <- which(!is.na(object$seg.mean) & width(object) > MIN_WIDTH)
  if(length(k) == 0) return(object)
  g <- object[k]
  means <- g$seg.mean
  d <- diff(means)
  if(all(d > thr)) return(object)
  regions <- paste0("region", cumsum(c(0, abs(d) > thr)))
  regions <- factor(regions, levels=unique(regions))
  indexlist <- split(seq_along(regions), regions)
  ##
  ## list of regions.  Each element is a vector of regions that can be
  ## joined
  ##
  glist <- vector("list", length(indexlist))
  for(i in seq_along(indexlist)){
    j <- indexlist[[i]]
    if(length(j)==1) next()
    ng <- reduce(g[j], min.gapwidth=min.gapwidth)
    ##
    ## TODO: combining the metadata.  Here, we're just taking the first row
    ##
    mc <- mcols(g)[j[1], , drop=FALSE]
    mc$seg.mean <- sum(width(g[j])*g$seg.mean[j])/sum(width(g[j]))
    mcols(ng) <- mc
    glist[[i]] <- ng
  }
  gnew <- unlist(GRangesList(unlist(glist)))
  ##  Remove regions in the original object that overlap with the
  ## joined regions (g contains all the original intervals that were
  ## subsequently joined)
  object2 <- filterBy(object, gnew)
  object3 <- c(object2, gnew)
  object3$seg.mean <- round(object3$seg.mean, 2)
  object3 <- sort(object3)
  object3
}

flankingRanges <- function(object, shift=2){
  rgs <- ranges(object)
  amplicons <- ampliconRanges(object)
  index <- match(names(amplicons), names(rgs))
  index2 <- index - shift  
  index2 <- index2[index2 > 0]
  index3 <- index + shift
  is_right_flank <- (index3 < length(rgs)) & (as.character(seqnames(rgs[index3])) == as.character(seqnames(rgs[index])))
  # This just fixes the issue at the right flank. Couldn't this happen at the left flank as well? 
  if(any(!is_right_flank)){
    ## REFACTOR.  No right flank exists.  If we exclude, linkedDuplicatedRanges will fail
    ## assign the amplicon index
    index3[!is_right_flank] <- index[!is_right_flank]
  }  
  left_flank <- rgs[index2]
  right_flank <- rgs[index3]
  left <- list(first=left_flank, last=amplicons)
  right <- list(first=amplicons, last=right_flank)
  list(left=left, right=right)
}

flankingDuplications <- function(object, minimum_foldchange=1){
  flanks <- flankingRanges(object,  shift=2)
  is_dupL <- isDuplication(flanks$left[[1]], minimum_foldchange)
  is_dupR <- isDuplication(flanks$right[[2]], minimum_foldchange)
  flanks$left <- lapply(flanks$left, "[", is_dupL)
  flanks$right <- lapply(flanks$right, "[", is_dupR)
  flanks
}

minimumBasepairCoverage <- function(rp, regions){
  dj <- disjoin(rp)
  dj <- dj[overlapsAny(dj, regions)]
  cnt <- countOverlaps(dj, rp)
  h <- findOverlaps(regions, dj)
  cntlist <- split(cnt[subjectHits(h)], queryHits(h))
  min_cnt <- sapply(cntlist, min)
  min_cnt
}

linkedDuplicatedRanges <- function(object, rpsegs,
                                   flanking_duplications,
                                   params){
  minimum_count <- params[["min_count"]]
  flank <- flanking_duplications
  lengths <- unlist(lapply(flank, elementNROWS))
  ##if(any(lengths != 1)) stop("Flanking regions must be length-one GRanges")
  gapsLeft <- GRanges(seqnames(flank[["left"]]$first),
                      IRanges(end(flank[["left"]]$first)+1,
                              start(flank[["left"]]$last)))
  gapsRight <- GRanges(seqnames(flank[["right"]]$first),
                       IRanges(end(flank[["right"]]$first)+1,
                               start(flank[["right"]]$last)))
  cntsLeft <- minimumBasepairCoverage(rpsegs, gapsLeft)
  cntsRight <- minimumBasepairCoverage(rpsegs, gapsRight)
  left <- lapply(flank$left, "[", which(cntsLeft >= minimum_count))
  right <- lapply(flank$right, "[", which(cntsRight >= minimum_count))
  list(left=left, right=right)##flanking_duplications[cnts >= minimum_count]
}

addFlanks <- function(object, dup_granges){
  ## left-of-amplicon flank
  flank <- dup_granges$left[[1]]
  if(length(flank) > 0){
    amp <- dup_granges$left[[2]]
    edges <- paste(names(flank), names(amp), sep="-")
    index <- match(names(flank), names(ranges(object)))
    new_nodes <- names(ranges(object))[index]
    object <- addNode(new_nodes, object, edges)
  }
  ## right-of-amplicon flank
  flank <- dup_granges$right[[2]]
  if(length(flank) > 0){
    amp <- dup_granges$right[[1]]
    edgR <- paste(names(amp), names(flank), sep="-")
    index <- match(names(flank), names(ranges(object)))
    new_nodes <- names(ranges(object))[index]
    object <- addNode(new_nodes, object, edgR)
  }
  object
}

#' Add focal amplicons that flank an amplicon seed 
#'
#' Amplicons selected to seed a graph have a fold change of at least log2(2.75),
#' or nearly 3-fold the diploid genome. Lower-copy amplicons flanking the seed
#' amplicons are added to the graph if they have a fold change of at least
#' log2(1.75), or nearly 2-fold the diploid genome. To assess whether a low-copy
#' amplicon is 'flanking' an amplicon seed, all read pairs that aligned to the
#' amplicon seeds (see \code{get_readpairs}) are passed to this function in
#' object \code{rp}. The read pairs are converted to segments by
#' \code{readPairsAsSegments} and an assessment of whether flanking duplications
#' are linked is given by \code{linkedDuplicatedRanges}. In particular, this
#' function checks whether there are at least \code{minimum_count} fragments
#' that connect the amplicon seeds to the flanking regions. Finally, the flanks
#' are added to the graph by the \code{addFlanks} function.
#'
#' REFACTORING: overly complicated with many abstractions.  Needs an
#' example to step through
#'
#'
#' @param object a \code{AmpliconGraph}
#'
#' @param rp a \code{GAlignmentPairs} object
#'
#' @param params a list of parameters for amplicon analyses
#' @seealso \code{\link{ampliconParams}}
#' @examples
#' \dontrun{
#'   path <- file.path(system.file(package="trellis"), "tests/testthat")
#'   ag <- readRDS(file.path(path, "addFocalDups.ffab104.rds"))
#'   params <- ampliconParams()
#'   ag <- addFocalDupsFlankingAmplicon(ag, rp, params)
#' }
#' @export
addFocalDupsFlankingAmplicon <- function(object, rp, params){
  if(totalWidth(queryRanges(object))==0) return(object)
  flanks <- flankingDuplications(object,
                                 minimum_foldchange=params[["LOW_THR"]])
  rpsegs <- readPairsAsSegments(rp)
  dup_gr <- linkedDuplicatedRanges(object, rpsegs, flanks, params)
  object <- addFlanks(object, dup_gr)
  object
}

numberAnchored <- function(object, read_pairs){
  seed <- ampliconRanges(object)
  R1_anchored <- overlapsAny(first(read_pairs), seed)
  R2_anchored <- overlapsAny(last(read_pairs), seed)
  number_anchored <- R1_anchored+R2_anchored
  zero_anchor <- read_pairs[number_anchored == 0]
  single_anchor <- read_pairs[number_anchored == 1]
  both_anchored <- read_pairs[number_anchored == 2]
  AnchoredReadPairs(zero_end=zero_anchor,
                    single_end=single_anchor,
                    both_ends=both_anchored)
}

edgeStats <- function(edges, param){
  freq <- table(names(edges))
  freq <- freq[freq >= param$freq]
  if(length(freq)==0) return(data.frame(freq=integer(), maxdist=integer(), bad_bin=logical()))
  stats <- data.frame(freq=as.integer(freq))
  rownames(stats) <- names(freq)
  e2 <- edges[names(edges) %in% rownames(stats)]
  index <- split(seq_len(length(e2)), names(e2))
  ##i <- NULL
  results <- vector("list", length(index))
  for(j in seq_along(index)){
    i <- index[[j]]
    results[[j]] <- start(first(e2))[i]
  }
  maxdist <- sapply(lapply(results, range), diff)
  ##maxdist <- sapply(lapply(foreach(i = index) %do% start(first(e2))[i], range), diff)
  names(maxdist) <- names(index)
  stats$maxdist <- maxdist[rownames(stats)]
  stats$bad_bin <- rep(FALSE, nrow(stats))
  bad_bins <- param$bad_bins
  if(length(bad_bins) == 0 ) return(stats)
  hits1 <- findOverlaps(first(e2), bad_bins[[1]], select="first")
  hits2 <- findOverlaps(last(e2), bad_bins[[2]], select="first")
  if(any(hits1 == hits2, na.rm=TRUE)){
    ## matches a flagged bin pair
    stop("variable flag not defined")
    ##id <- names(e2)[flag]
    ##stats$bad_bin[id] <- TRUE
  }
  ## check the reverse
  hits1 <- findOverlaps(first(e2), bad_bins[[2]], select="first")
  hits2 <- findOverlaps(last(e2), bad_bins[[1]], select="first")
  if(any(hits1 == hits2, na.rm=TRUE)){
    stop("variable flag not defined")
##    id <- names(e2)[flag]
##    stats$bad_bin[id] <- TRUE
  }
  stats
}

edgeFilters <- function(edges, param=FilterEdgeParam()){
  stats <- edgeStats(edges, param)
  valid_edges <- rownames(stats)[stats$maxdist >= param$minimum_maxdist & !stats$bad_bin]
  valid_edges
}

singleAnchorHits <- function(object, read_pair){
  if(length(read_pair) < 1) return(NULL)
  a <- ampliconRanges(object)
  hitfirst <- findOverlaps(first(read_pair), a, select="all", maxgap=1e3)
  g <- nonampliconRanges(object)
  hitlast <- findOverlaps(last(read_pair), g, select="all", maxgap=1e3)
  hitlist <- hitsWithSameQuery(list(hitfirst, hitlast))
  if(!identicalReadQueries(hitlist)){
    hitlist <- dropDuplicatedQueryHits(hitlist)
  }
  hitlist <- hitsWithDifferentTargets(hitlist)
  edges <- orderEdgesByIndexOfOverlap(hitlist, ranges1=a, ranges2=g)
  supporting_read_pair <- read_pair[queryHits(hitlist[[1]])]
  names(supporting_read_pair) <- edges
  supporting_read_pair
}

addDuplications <- function(object, granges, edges){
  if(length(granges) > 0){
    index <- match(unique(names(granges)), names(ranges(object)))
    new_nodes <- names(ranges(object))[index]
    object <- addNode(new_nodes, object, edges)
  }
  object
}

linkFocalDups <- function(object, rp, params){
  LOW_THR <- params[["LOW_THR"]]
  edgeParam <- params[["edge"]]
  if(totalWidth(queryRanges(object))==0) return(object)
  anchors <- numberAnchored(object, rp)
  se_anchors <- singleEndAnchors(anchors)
  e <- singleAnchorHits(object, se_anchors)
  if(is.null(e)) return(object)
  keep <- edgeFilters(e, param=edgeParam)
  e <- e[names(e) %in% keep]
  if(length(e)==0) return(object)
  candidates <- ranges(object)[node2(names(e))]
  is_dup <- isDuplication(candidates, minimum_foldchange=LOW_THR)
  e <- e[is_dup]
  candidates <- candidates[is_dup]
  object <- addDuplications(object, candidates, unique(names(e)))
  object
}

orderEdgesByIndexOfOverlap <- function(hitlist, ranges1, ranges2){
  if(missing(ranges2)) ranges2 <- ranges1
  j <- subjectHits(hitlist[[1]])
  k <- subjectHits(hitlist[[2]])
  jk <- cbind(pmin(j, k), pmax(j,k))
  edges <- paste(names(ranges1)[jk[,1]],
                 names(ranges2)[jk[,2]],
                 sep="-")
  edges
}

hitsWithSameQuery <- function(hitlist){
  qhits <- intersect(queryHits(hitlist[[1]]), queryHits(hitlist[[2]]))
  hitlist <- lapply(hitlist, function(x, index) x[queryHits(x) %in% index], index=qhits)
  hitlist
}

hitsWithDifferentTargets <- function(hitlist){
  j <- subjectHits(hitlist[[1]])
  k <- subjectHits(hitlist[[2]])
  notequal <- j != k
  hitlist <- lapply(hitlist, "[", i=notequal)
  hitlist
}

identicalReadQueries <- function(hitlist)
  identical(queryHits(hitlist[[1]]), queryHits(hitlist[[2]]))

dropDuplicatedQueryHits <- function(hitlist){
  qh1 <- queryHits(hitlist[[1]])
  qh2 <- queryHits(hitlist[[2]])
  d1 <- duplicated(qh1)
  d2 <- duplicated(qh2)
  hitlist[[1]] <- hitlist[[1]][!d1, ]
  hitlist[[2]] <- hitlist[[2]][!d2, ]
  hitlist
}

twoAnchorHits <- function(object, read_pair){
  if(length(read_pair) < 1) return(NULL)
  a <- ampliconRanges(object)
  hitfirst <- findOverlaps(first(read_pair), a, select="all", maxgap=1e3)
  hitlast <- findOverlaps(last(read_pair), a, select="all", maxgap=1e3)
  hitlist <- hitsWithSameQuery(list(hitfirst, hitlast))
  if(!identicalReadQueries(hitlist)){
    hitlist <- dropDuplicatedQueryHits(hitlist)
  }
  hitlist <- hitsWithDifferentTargets(hitlist)
  edges <- orderEdgesByIndexOfOverlap(hitlist, ranges1=a)
  supporting_read_pair <- read_pair[queryHits(hitlist[[1]])]
  names(supporting_read_pair) <- edges
  supporting_read_pair
}

linkAmplicons <- function(object, rp, edgeParam=FilterEdgeParam()){
  rp <- rp[overlapsAny(rp, ampliconRanges(object), maxgap=1e3)]
  if(numNodes(object)<2) return(object)
  anchors_rp <- numberAnchored(object, rp)
  e <- twoAnchorHits(object, bothEndAnchors(anchors_rp))
  if(length(e)==0) return(object)  ##there are no improper pairs linking amplicons
  e <- e[names(e) %in% edgeFilters(e, param=edgeParam)]
  if(length(e)==0) return(object)
  tabe <- table(names(e))
  keep <- names(tabe)[tabe >= edgeParam$freq]
  if (length(keep) == 0) return(object)
  from <-  node1(keep)
  to <- node2(keep)
  existing <- edges(object)[from] ## is 'to' in any of the existing edges
  edge_exists <- mapply(function(to, existing) to %in% existing,
                        to=to, existing=existing)
  if(!all(edge_exists)){
    graph(object) <- addEdge(node1(keep[!edge_exists]),
                         node2(keep[!edge_exists]), graph(object))
  }
  object
}

linkNearAmplicons <- function(object, maxgap=500e3){
  hits <- findOverlaps(ampliconRanges(object), maxgap=maxgap)
  hits <- hits[!isSelfHit(hits)]
  hits <- hits[!isRedundantHit(hits)]
  if(length(hits)==0) return(object)
  new_edges <- paste(names(ampliconRanges(object))[queryHits(hits)],
                     names(ampliconRanges(object))[subjectHits(hits)],
                     sep="-")
  from <-  node1(new_edges)
  to <- node2(new_edges)
  existing <- edges(object)[from] ## is 'to' in any of the existing edges
  edge_exists <- mapply(function(to, existing) to %in% existing,
                        to=to, existing=existing)
  new_edges <- new_edges[!edge_exists]
  if(length(new_edges)==0) return(object)
  graph(object) <- addEdge(node1(new_edges),
                           node2(new_edges), graph(object))
  object
}

germline <- function(object){
  reduce(c(germlineCNV(object), outliers(object)))
}

#' Identify lower copy focal amplicons (possibly duplicatons) not already
#' annotated as amplicons and not contained within the germline filters
#'
#' Select all duplicated segments (see \code{isDuplication}) less than `maxgap`
#' kb from the seed amplicons of the graph that are not within a germline CNV or
#' outlier. Germline CNVs and outliers can be conveniently extracted from the
#' graph object as given in the examples.
#'
#' @export
#'
#' @param object A \code{AmpliconGraph}
#' @param params a list of parameters such as that generated by \code{ampliconParams}.
#' @return a reduced \code{GRanges} of lower-copy amplicons
#' @seealso \code{\link{ampliconParams}}
#' @examples
#'   ## load a previously saved AmpliconGraph from a regression unit test
#'   \dontrun{
#'     path <- system.file("testthat", package="trellis")
#'     ag <- readRDS(file.path(path, "addFocalDups.ffab104.rds"))
#'     params <- ampliconParams()
#'     focalAmpliconDupRanges(ag, params)
#' }
#' @export
focalAmpliconDupRanges <- function(object, params){
  LOW_THR <- params[["LOW_THR"]]
  MAX_SIZE <- params[["maxgap"]]
  g <- ampliconRanges(object)
  is_dup <- isDuplication(ranges(object), minimum_foldchange=LOW_THR)
  ## remove any lower copy ranges that are very large
  dup_g <- ranges(object)[is_dup]
  dup_g <- dup_g[width(dup_g) < MAX_SIZE]
  g <- sort(c(dup_g, g))
  ##
  ## TODO: add expansion to params
  ##
  if(length(g) > 0){
    g <- expandGRanges(g, 1e3)
  }
  ## we do not want to link germline events
  germ <- germline(object)
  ##
  ## TODO: add expansion to params
  ##
  germ <- expandGRanges(germ, 10e3)
  germ <- reduce(germ)
  g <- filterBy(g, germ, type="within")
  reduce(g)
}

removeNode2 <- function (node, object) {
  ##object <- clearNode(node, object)
  nN <- nodes(object)
  node <- node[node %in% nN]
  if(length(node) == 0) return(object)
  wh <- match(node, nN)
  gN <- nN[-wh]
  nE <- object@edgeL
  nE <- nE[gN]
  object@edgeL <- nE
  identical(names(nE), gN)
  ##nE <- object@edgeL[-wh]
  nE2 <- lapply(nE, function(el) {
    ## Nodes that never linked to anything to begin with
    if(length(el$edges) == 0) return(el)
    oldN <- nN[el$edges]
    result <- match(oldN, gN)
    result <- as.integer(result[!is.na(result)])
    el$edges <- result
    el
  })
  object@nodes <- gN
  object@edgeL <- nE2
  object
}

filterSmallAmplicons <- function(object, MIN_SIZE=5e3){
  ar <- reduce(ampliconRanges(object), min.gapwidth=5e3)
  ar <- ar[width(ar) < 5e3]
  if(length(ar) > 0){
    j <- subjectHits(findOverlaps(ar, ranges(object)))
    gN <- removeNode2(names(ranges(object))[j], graph(object))
    graph(object) <- gN
  }
  object
}

updateRangesMetadata <- function(object, granges){
  index <- match(names(granges), names(ranges(object)))
  rg <- ranges(object)[-index]
  rg2 <- sort(c(rg, granges))
  ranges(object) <- rg2
  object
}

#' Assign a group id for nodes that are linked
#'
#' Takes a list of amplicons (GRanges object) and an undirected graph
#' representation of the amplicons
#'
#' @export
#' @param gN a \code{AmpliconGraph} object
groupAmplicons <- function(gN){
  V <- nodes(gN)
  if(numEdges(gN) > 0){
    kgroups <- kCliques(gN)
    groups <- kgroups[[length(kgroups)]]
    grouped.amplicon <- rep(NA, length(V))
    for(k in seq_along(groups)){
      grouped.amplicon[match(groups[[k]], V)] <- k
    }
    if(any(is.na(grouped.amplicon))){
      grouped.amplicon[is.na(grouped.amplicon)] <- max(grouped.amplicon, na.rm=TRUE) +
        seq_len(sum(is.na(grouped.amplicon)))
    }
    grouped.amplicons <- as.integer(factor(grouped.amplicon))
  } else grouped.amplicon <- seq_len(length(V))
  names(grouped.amplicon) <- V
  factor(grouped.amplicon)
}

setAmpliconGroups <- function(object){
  groups <- groupAmplicons(graph(object))
  groups <- setNames(paste0("gr", groups), names(groups))
  ar <- ampliconRanges(object)
  groups <- groups[names(ar)]
  ar$groups <- groups
  object <- updateRangesMetadata(object, ar)
  object
}

getGenes <- function(object, transcripts){
  .hgnc <- setNames(rep(NA, length(object)), names(object))
  hits <- findOverlaps(transcripts, object, type="within")
  ##genes <- hgnc(transcripts)[queryHits(hits)]
  genes <- transcripts$gene_name[queryHits(hits)]
  amp <- names(object)[subjectHits(hits)]
  geneL <- lapply(split(genes, amp), function(x) paste(unique(x), collapse=", "))
  genes <- unlist(geneL)
  .hgnc[names(genes)] <- genes
  object$hgnc <- as.character(.hgnc)
  object
}

setGenes <- function(object, transcripts){
  ar <- getGenes(ampliconRanges(object), transcripts)
  object <- updateRangesMetadata(object, ar)
  object
}

## Two levels of significance for genes:
##    - clinical significance (synonymous with driver)
##
##    - biologically significant: perhaps biologically significant but unkown
##      clinical significance. This set of all clinically significant
##      genes is a subset
driver_genes <- function(tx, clin_sign=FALSE){
  if(clin_sign){
    return(tx$gene_name[tx$cancer_connection])
  }
  tx$gene_name [ tx$biol_sign ]
}

getDrivers <- function(object, transcripts, clin_sign=TRUE){
  known_drivers <- unique(driver_genes(transcripts, clin_sign=clin_sign))
  genes <- object$hgnc
  gene_list <- split(genes, object$groups)
  gene_list <- lapply(gene_list, function(x){
    x <- x[!is.na(x)]
    if(length(x)==0) return(NA)
    xx <- unlist(strsplit(x, ", "))
  })
  driver_in_grouped_amplicon <- lapply(gene_list, function(x){
    x <- paste(x[x %in% known_drivers], collapse=", ")
    if(x == "") x <- NA
    x
  })
  driver_in_grouped_amplicon <- unlist(driver_in_grouped_amplicon)
  if(all(is.na(driver_in_grouped_amplicon))) return(object)
  drv <- driver_in_grouped_amplicon[!is.na(driver_in_grouped_amplicon)]
  for(i in seq_along(drv)){
    driver_group <- names(drv)[i]
    if(clin_sign){
      object$driver[object$groups == driver_group] <- drv[i]
    } else {
      object$biol_sign[ object$groups == driver_group ] <- drv[i]
    }
  }
  if(clin_sign){
    object$driver <- as.character(object$driver)
  } else {
    object$biol_sign <- as.character(object$biol_sign)
  }
  object
}

setDrivers <- function(object, transcripts, clin_sign=TRUE){
  ar <- ampliconRanges(object)
  ar <- getDrivers(ar, transcripts, clin_sign=clin_sign)
  object <- updateRangesMetadata(object, ar)
  object
}

#' Initialize a graph object for amplicon analyses
#'
#'
#' @param params a list of parameters
#' @param af a list of germline filters
#' @param segs a \code{GRanges} object from the segmentation of the normalized coverage
#' @seealso \code{\link{ampliconParams}}
#' @export
#' @examples
#'   library(svfilters.hg19)
#'   library(svbams)
#'   library(Rsamtools)
#'   data(germline_filters)
#'   ##
#'   ## read in some CNVs
#'   ##
#'   cv.extdata <- system.file("extdata", package="svbams")
#'   segs <- readRDS(file.path(cv.extdata, "cgov44t_segments.rds"))
#'   extdata <- system.file("extdata", package="svbams")
#'   bview <- BamViews(bamPaths=file.path(extdata, "cgov44t_revised.bam"))
#'   params <- ampliconParams()
#'   germline_filters[["germline_cnv"]] <- GRanges()
#'   germline_filters[["outliers"]] <- GRanges()
#'   makeAGraph(segs, germline_filters, params)
makeAGraph <- function(segs, af, params){
  segs$is_amplicon <- segs$seg.mean > params$AMP_THR
  ag <- AmpliconGraph(ranges=segs,
                      filters=af,
                      params=params)
  if (numNodes (ag) == 0) return (ag)
  ag <- trimRangesOverlappingCentromere (ag, af[["centromeres"]])
  ag
}

makeAGraph2 <- function(segs, af, params){
  ag <- AmpliconGraph(ranges=segs,
                      filters=af,
                      params=params)
  if (numNodes (ag) == 0) return (ag)
  ag <- trimRangesOverlappingCentromere (ag, af[["centromeres"]])
  ag
}

#' Construct an AmpliconGraph from a BamViews object
#'
#' This function constructs an \code{AmpliconGraph} from a
#' \code{BamViews} object for a single sample.  By default, the seeds
#' of the graph are focal amplicons with fold-change of nearly 3
#' relative to the diploid genome (log2(2.75)).  The threshold of
#' seeding amplicons can be adjusted by the \code{ampliconParams}
#' function. After seeding the graph with high-copy focal amplicons,
#' both neighboring (flanking) and distant low-copy focal amplicons
#' are added to the graph object.  Next, improperly paired reads in
#' which both the first and last read align to any queryRange of the
#' graph object are parsed from the bam file ( this function will
#' throw an error if not all files in \code{bamPaths} exist).  If 5 or
#' more improperly paired reads bridge a node to another node, these
#' amplicons are grouped.  Further, if a low-copy amplicon is bridged
#' to an existing node, the low-copy amplicon will become a node in
#' the graph. Amplicon groups are defined by the edges between nodes,
#' where the edges represent improperly paired reads that support a
#' junction between two amplicons.
#'
#' REFACTORING: needs an example to step through.  Perhaps an initial
#' graph object and then keep updating the graph object and associated
#' visualization with each step.  Each of the low level functions in
#' sv_deletions should be exported to more fully document this
#' procedure.
#'
#' @seealso See \code{ampliconParams} for default parameters. The
#'   wrapper \code{\link{sv_amplicon_exp}} constructs and saves an
#'   \code{\linkS4class{AmpliconGraph}} for each sample in an
#'   experiment.
#'
#' @export
#' @param bview a \code{BamViews} object
#'
#' @param segs a \code{GRanges} object of segments with log2
#'   fold-changes consistent with amplification
#'
#' @param amplicon_filters a \code{list} of filters
#' @param params a list of parameters for the amplicon analysis
#' @param transcripts a \code{GRanges} object of transcripts
sv_amplicons <- function(bview, segs, amplicon_filters,
                         params, transcripts){
  ag <- initialize_graph(segs, amplicon_filters, params)
  REMOTE <- file.exists(bamPaths(bview))
  if(!REMOTE) stop ("Path to BAM files is invalid")
  ##
  ## REFACTOR: could this step use the saved improper read pairs
  ##
  rp <- get_readpairs(ag, bamPaths(bview))
  ag <- addFocalDupsFlankingAmplicon(ag, rp, params)
  queryRanges(ag) <- focalAmpliconDupRanges(ag, params)
  irp <- get_improper_readpairs(ag, bamPaths(bview))
  ##
  ## At this point, focal duplications added to the graph have not
  ## been linked to any of the seeds
  ##
  ##paired_bin_filter <- af[["paired_bin_filter"]]
  ##param <- FilterEdgeParam(minimum_maxdist=50, bad_bins=paired_bin_filter)
  ##param <- FilterEdgeParam(minimum_maxdist=50, bad_bins=GRanges())
  edge.p <- params[["edge"]]
  ag <- linkFocalDups(ag, irp, params)
  ag <- linkAmplicons(ag, irp, edgeParam=edge.p)
  ag <- linkNearAmplicons(ag, maxgap=params[["maxgap"]])
  ag <- filterSmallAmplicons (ag)
  ag <- setAmpliconGroups (ag)
  ag <- setGenes (ag, transcripts)
  ag <- setDrivers (ag, transcripts, clin_sign=TRUE)
  ag <- setDrivers (ag, transcripts, clin_sign=FALSE)
  ag
}

initialize_graph <- function(segs, amplicon_filters, params){
  ag <- makeAGraph(segs, amplicon_filters, params)
  merged <- joinNearGRanges(ranges(ag), params)
  names(merged) <- ampliconNames(merged)
  ## the names of the nodes no longer correspond to the range names
  ## stopifnot(nodes(ag) %in% names(tmp)), and so
  ## setAmpliconGroups fails
  ranges(ag) <- merged
  ag
}


ampliconFilters <- function(genome){
  pkg <- paste0("svfilters.", genome)
  data("germline_filters", package=pkg, envir=environment())
  germline_filters <- get("germline_filters")
  germline_filters
}

amplified_segments <- function(segments, params){
  segments$is_amplicon <- segments$seg.mean > params$AMP_THR
  segments
}

initialize_graph2 <- function(preprocess, filters, params){
  ag <- makeAGraph2(preprocess$segments, filters, params)
  merged <- joinNearGRanges(ranges(ag), params)
  names(merged) <- ampliconNames(merged)
  ## the names of the nodes no longer correspond to the range names
  ## stopifnot(nodes(ag) %in% names(tmp)), and so
  ## setAmpliconGroups fails
  ranges(ag) <- merged
  ag
}

add_amplicons <- function(ag, bam.file, params){
  proper_rp <- get_readpairs2(queryRanges(ag), bam.file)
  ag <- addFocalDupsFlankingAmplicon(ag, proper_rp, params)
  queryRanges(ag) <- focalAmpliconDupRanges(ag, params)
  ag
}

#' Construct an AmpliconGraph from a BamViews object
#'
#' This function constructs an \code{AmpliconGraph} from a
#' \code{BamViews} object for a single sample.  By default, the seeds
#' of the graph are focal amplicons with fold-change of nearly 3
#' relative to the diploid genome (log2(2.75)).  The threshold of
#' seeding amplicons can be adjusted by the \code{ampliconParams}
#' function. After seeding the graph with high-copy focal amplicons,
#' both neighboring (flanking) and distant low-copy focal amplicons
#' are added to the graph object.  Next, improperly paired reads in
#' which both the first and last read align to any queryRange of the
#' graph object are parsed from the bam file ( this function will
#' throw an error if not all files in \code{bamPaths} exist).  If 5 or
#' more improperly paired reads bridge a node to another node, these
#' amplicons are grouped.  Further, if a low-copy amplicon is bridged
#' to an existing node, the low-copy amplicon will become a node in
#' the graph. Amplicon groups are defined by the edges between nodes,
#' where the edges represent improperly paired reads that support a
#' junction between two amplicons.
#'
#'
#' @seealso See \code{ampliconParams} for default parameters. The
#'   wrapper \code{\link{sv_amplicon_exp}} constructs and saves an
#'   \code{\linkS4class{AmpliconGraph}} for each sample in an
#'   experiment.
#'
#' @export
#' @param preprocess a list of the preprocessed data (see \code{preprocessData})
#' @param amplicon_filters a \code{GRanges} object of germline filters.  If this
#' argument is not specified then a default set of germline filters will be used
#' from the \code{svfilters.hg19} package if the \code{genome} slot in \code{preprocess} is
#' set to "hg19" or from the \code{svfilters.hg18} package if it's set to "hg18".
#' @param params a list of parameters for the amplicon analysis.  Default parameters
#' are given by \code{AmpliconParams()}.
#' @return an \code{AmpliconGraph} object
#' @examples
#' data(pdata)
#' pdata$bam.file <- system.file("extdata", "cgov44t_revised.bam",
#'                               package="svbams")
#' sv_amplicons2(pdata)
#' @export
sv_amplicons2 <- function(preprocess, amplicon_filters,
                          params=ampliconParams()){
  if(missing(amplicon_filters)){
    amplicon_filters <- ampliconFilters(preprocess$genome)
  }
  preprocess$segments <- amplified_segments(preprocess$segments, params)
  ag <- initialize_graph2(preprocess, amplicon_filters, params)
  if (length(ag) == 0) return(ag)
  ##
  ## Requires bam file -- can not work remotely
  ##
  ag <- add_amplicons(ag, preprocess$bam.file, params)
  improper_rp <- preprocess$read_pairs[["improper"]]
  ag <- link_amplicons(ag, improper_rp, params)
  tx <- loadTx(preprocess$genome)
  ag <- annotate_amplicons(ag, tx)
  ag
}

link_amplicons <- function(ag, improper_rp, params){
  edge.p <- params[["edge"]]
  ag <- linkFocalDups(ag, improper_rp, params)
  ag <- linkAmplicons(ag, improper_rp, edgeParam=edge.p)
  ag <- linkNearAmplicons(ag, maxgap=params[["maxgap"]])
  ag <- filterSmallAmplicons (ag)
  ag <- setAmpliconGroups (ag)
  ag
}

annotate_amplicons <- function(ag, tx){
  ag <- setGenes (ag, tx)
  ag <- setDrivers (ag, tx, clin_sign=TRUE)
  ag <- setDrivers (ag, tx, clin_sign=FALSE)
  ag
}




#' Identify somatic amplicons and grouped amplicons
#'
#' This function identifies focal, somatic amplifications.  As
#' amplicons are often grouped by the bridge-fusion-breakage cycles,
#' we represent the set of somatic amplicons identified for a sample
#' as a graph. The nodes of the graph are the amplicons and the edges
#' are the links between amplicons informed by paired reads. This
#' function is a wrapper for \code{sv_amplicons} that constructs an
#' \code{AmpliconGraph} for a given sample.  As with the other
#' *Experiment functions, \code{sv_amplicon_exp} saves the
#' \code{AmpliconGraph} computed for each sample as in intermediate
#' file for quick recall.  If the file exists, this function reads the
#' serialized R object from disk. To generate an \code{AmpliconGraph}
#' de novo, one must first delete the intermediate files (see
#' examples).
#'
#' @seealso See \code{\linkS4class{AmpliconGraph}} for methods
#'   associated with the class and \code{\link{sv_amplicons}} for
#'   construction of an \code{AmpliconGraph} for a single sample.
#' 
#'
#' @param dirs character-vector of file paths for storing intermediate
#'   files
#' @param bviews A \code{BamViews} object
#' @param grl A \code{GRangesList} of the segmented genomes (each
#'   element is the \code{GRanges} for a sample)
#' @param amplicon_filters A list of germline filters and parameters. See the \code{germline_filters} object in the package \code{svfilters.<build>}.
#' @param params  a list of parameters for the amplicon analysis
#' @param transcripts a \code{GRanges} object of the transcripts.  
sv_amplicon_exp <- function(dirs, bviews, grl, amplicon_filters,
                            params=ampliconParams(),
                            transcripts){
  ag_files <- file.path(dirs["2amplicons"],
                        paste0(colnames(bviews), ".rds"))
  result_list <- setNames(vector("list",
                                 length(ag_files)),
                          colnames(bviews))
  for(i in seq_along(result_list)){
    if(file.exists(ag_files[i])){
      ag <- readRDS(ag_files[i])
      result_list[[i]] <- ag
      next()
    }
    ag <- sv_amplicons(bviews[, i], segs=grl[[i]],
                       amplicon_filters, params, transcripts)
    result_list[[i]] <- ag
    saveRDS(ag, file=ag_files[i])
  }
  result_list
}




recurrentAmplicons <- function(tx, grl, maxgap=5e3){
  ## tx is big.  Make this smaller as a first step
  tx <- subsetByOverlaps(tx, reduce(unlist(grl), min.gapwidth=maxgap))
  gene_list <- split(tx, tx$gene_name)
  ## remove any element that has multiple chromosomes
  nchroms <- sapply(gene_list, function(g) length(unique(chromosome(g))))
  gene_list <- gene_list[nchroms == 1]
  ## A gene can have multiple entries.  Try reduce
  gene_list <- GRangesList(lapply(gene_list, reduce, min.gapwidth=maxgap))
  ##
  ## Add back the gene name
  ##
  el <- elementNROWS(gene_list)
  tx <- unlist(gene_list)
  tx$gene_name <- rep(names(gene_list), el)
  ## ensure that 2 amplicons for a subject hitting a gene are only counted once
  is_overlap_list <- lapply(grl, function(gr, tx, maxgap) {
    overlapsAny(tx, gr, maxgap=maxgap)
  }, maxgap=maxgap, tx=tx)
  is_overlap <- do.call(cbind, is_overlap_list)
  cnts <- rowSums(is_overlap)
  tx <- tx[cnts > 1, ]
  is_overlap <- is_overlap[ cnts > 1, ]
  cnts <- cnts[ cnts > 1 ]

  ids <- apply(is_overlap, 1, function(is_amplicon, id) {
    paste(id[is_amplicon], collapse=",")
  }, id=gsub(".bam", "", colnames(is_overlap)))
  result <- data.frame(gene = tx$gene_name, freq=as.integer(cnts), id=ids)
  ##
  ## Gene coordinates
  ##
  tx2 <- tx[tx$gene_name %in% result$gene]
  tx2.list <- GRangesList(sapply(split(tx2, tx2$gene_name), reduce))
  tx2 <- unlist(tx2.list)
  tx2 <- tx2[result$gene]
  stopifnot(identical(names(tx2), as.character(result$gene)))
  result$chr <- chromosome(tx2)
  result$start <- start(tx2)
  result$end <- end(tx2)
  result <- result[, c("gene", "chr", "start", "end", "freq", "id")]
  rownames(result) <- NULL
  result[order(result$freq, decreasing=TRUE), ]
}

#' Table of recurrent drivers
#'
#' Given a GRangesList of amplicons or deletions (each element of list is a
#' sample), tabulate the frequency of driver deletions or amplifications.
#' 
#' @param grl a \code{GRangesList} organized by sample
#' @param transcripts a \code{GRanges} object of transcripts as provided by the
#'   \code{svfilters.<ucsc_build>} packages
#' @param split A character string indicating the string used to separate
#'   drivers. By default, we assume the drivers associated with a given deletion
#'   or amplicon are separated by a comma followed by a space.
#' @return a \code{data.frame} of recurrent drivers
#' @export
recurrentDrivers <- function(grl, transcripts, split=", "){
  driver_list <- sapply(grl, function(g){
    dr <- as.character(g$driver)
    if(length(dr) == 0) return(NULL)
    dr <- dr[!is.na(dr)]
    dr <- unlist(strsplit(dr, split))
    unique(dr)
  })
  driver_list2 <- driver_list[ elementNROWS(driver_list) > 0 ]
  df <- data.frame(id=rep(names(driver_list2), elementNROWS(driver_list2)),
                   gene=unlist(driver_list2))
  dflist <- split(df, df$gene)
  ids <- sapply(sapply(dflist, "[[", "id"), paste, collapse=",")
  df <- data.frame(gene=names(dflist),
                   id=ids,
                   freq=elementNROWS(dflist))
  tx <- transcripts[transcripts$gene_name %in% df$gene]
  txlist <- split(tx, tx$gene_name)
  tx <- unlist(reduce(txlist, min.gapwidth=2e3))
  df$chr <- chromosome(tx)
  df$start <- start(tx)
  df$end <- end(tx)
  df <- df[order(df$freq, decreasing=TRUE), ]
  df
}

#' @include AllGenerics.R
NULL

## #' Plot amplicons as nodes and the links between amplicons as edges
## #'
## #' Linked amplicons are represented as a \code{AmpliconGraph}. With
## #' amplicons as nodes and edges representing links between amplicons
## #' from paired reads, a \code{graphNEL} object encapsulates this data.
## #' Plotting utilities in the \code{Rgraphviz} package can be used to
## #' visualize these graphs.
## #'
## #' @export
## #'
## #' @seealso See \code{\link{sv_amplicons}} for constructing
## #'   an \code{AmpliconGraph} object. See
## #'   \code{\link{qualitativeColors}} for a list of qualitative
## #'   colors. See \code{\link[graph]{graphNEL-class}} for details on
## #'   graph representation.
## #'
## #' @param ag a \code{AmpliconGraph} object
## #' @param col_list a list of color palettes
## plot_amplicons <- function(ag, col_list=qualitativeColors()){
##   if(length(ag)==0){
##     message("No amplicons -- nothing to plot")
##     return(invisible())
##   }
##   dark_colors <- c("#332288", "#661100", "#882255", "#AA4499")
##   ar <- nodes(ag)
##   chroms <- sapply(strsplit(ar, ":"), "[", 1)
##   sl <- unique(chroms)
##   L <- length(sl)
##   L <- ifelse(L > 12, 12, L)
##   sl <- factor(chroms, levels=sl)
##   color_nodes <- col_list[[L]][as.integer(sl)]
##   g1 <- graph(ag)
##   color_nodes <- setNames(color_nodes, nodes(g1))
##   nodenames <- setNames(nodes(g1), nodes(g1))
##   text_col <- setNames(rep("black", length(nodes(g1))), nodes(g1))
##   text_col[color_nodes %in% dark_colors] <- "gray90"
##   nodeRenderInfo(g1) <- list(label=nodenames, fill=color_nodes, textCol=text_col)
##   nodeAttrs <- list(fillcolor=color_nodes)
##   attrs <- list(node=list(shape="rectangle",
##                           fixedsize=FALSE),
##                 graph=list(rankdir="LR"))
##   graph_object <- layoutGraph(g1,
##                               attrs=attrs,
##                               nodeAttrs=nodeAttrs)
##   graph_object
## }
cancer-genomics/trellis documentation built on June 8, 2024, 6:01 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cancer-genomics/trellis
Somatic structural variant analysis

R/amplicon-utils.R
In cancer-genomics/trellis: Somatic structural variant analysis

Documented in addFocalDupsFlankingAmplicon AmpliconGraph ampliconParams focalAmpliconDupRanges groupAmplicons joinNearGRanges makeAGraph recurrentDrivers sv_amplicon_exp sv_amplicons sv_amplicons2

R Package Documentation

Browse R Packages

We want your feedback!

cancer-genomics/trellis Somatic structural variant analysis

R/amplicon-utils.R In cancer-genomics/trellis: Somatic structural variant analysis

Documented in addFocalDupsFlankingAmplicon AmpliconGraph ampliconParams focalAmpliconDupRanges groupAmplicons joinNearGRanges makeAGraph recurrentDrivers sv_amplicon_exp sv_amplicons sv_amplicons2

R Package Documentation

Browse R Packages

We want your feedback!

cancer-genomics/trellis
Somatic structural variant analysis

R/amplicon-utils.R
In cancer-genomics/trellis: Somatic structural variant analysis