R/bambu-extendAnnotations-utilityExtend.R

Defines functions addGeneIdsToReadClassTable isore.estimateDistanceToAnnotations combineWithAnnotations includeOverlapReadClass addNewUnsplicedReadClasses genFilteredAnTable calculateDistToAnnotation assignGeneIDbyMaxMatch updateWIntronMatches addNewSplicedReadClasses createExonByReadClass makeExonsIntronsSpliced calculateNDR calculateNDROnTranscripts recommendNDR filterTranscriptsByAnnotation filterTranscripts isore.extendAnnotations

#' Extend annotations
#' @inheritParams bambu
#' @noRd
isore.extendAnnotations <- function(combinedTranscripts, annotationGrangesList,
                                    remove.subsetTx = TRUE,
                                    min.sampleNumber = 1, NDR = NULL, min.exonDistance = 35, min.exonOverlap = 10,
                                    min.primarySecondaryDist = 5, min.primarySecondaryDistStartEnd = 5, 
                                    min.readFractionByEqClass = 0, fusionMode = FALSE,
                                    prefix = "Bambu", baselineFDR = 0.1, defaultModels = NULL, verbose = FALSE){
  combinedTranscripts <- filterTranscripts(combinedTranscripts, min.sampleNumber)
  if (nrow(combinedTranscripts) > 0) {
    group_var <- c("intronStarts","intronEnds","chr","strand","start","end",
                   "confidenceType","readCount", "maxTxScore", "maxTxScore.noFit")
    rowDataTibble <- select(combinedTranscripts,all_of(group_var))
    annotationSeqLevels <- seqlevels(annotationGrangesList)
    rowDataSplicedTibble <- filter(rowDataTibble,
                                   confidenceType == "highConfidenceJunctionReads")
    transcriptRanges <- makeExonsIntronsSpliced(
      rowDataSplicedTibble, annotationSeqLevels)
    confidenceTypeVec <- rowDataTibble$confidenceType
    if(nrow(rowDataSplicedTibble)>0){
        rowDataFilteredSpliced <- addNewSplicedReadClasses(transcriptRanges,
                                                       rowDataSplicedTibble, annotationGrangesList, 
                                                       min.exonDistance, min.primarySecondaryDist,
                                                       min.primarySecondaryDistStartEnd, verbose)
    } else{ rowDataFilteredSpliced = NULL}
    rowDataFilteredUnspliced <- rowDataTibble[which(confidenceTypeVec == "unsplicedNew"),]
    SEnRng <- addNewUnsplicedReadClasses(rowDataFilteredUnspliced, 
                                         rowDataFilteredSpliced, transcriptRanges$exons, 
                                         annotationGrangesList, min.exonOverlap, verbose)
    rowDataCombined <- SEnRng$rowDataCombined
    exonRangesCombined <- SEnRng$exonRangesCombined
    # assign gene IDs based on exon match
    geneIds <- assignGeneIds(grl = exonRangesCombined,
                             annotations = annotationGrangesList,
                             min.exonOverlap = min.exonOverlap,
                             fusionMode = fusionMode)
    rowDataCombined$GENEID <- geneIds[,1]
    rowDataCombined$novelGene <- geneIds[,2]
    if(fusionMode) rowDataCombined$readClassType[geneIds[,3]] <- 'fusionTranscript'
    
    # ## filter out transcripts
    extendedAnnotationRanges <- filterTranscriptsByAnnotation(
      rowDataCombined, annotationGrangesList, exonRangesCombined, prefix,
      remove.subsetTx, min.readFractionByEqClass, baselineFDR, NDR, defaultModels, verbose)
    return(extendedAnnotationRanges)
  } else {
    message("The current filtering criteria filters out all new read 
            classes, please consider less strigent critria!")
    return(annotationGrangesList)
  }
}

#' filter transcripts by read counts
#' @noRd
filterTranscripts <- function(combinedTranscripts, min.sampleNumber){
  filterSet = NULL
  if (nrow(combinedTranscripts) > 0){
    #filter by read counts
    naTxscore <- is.na(combinedTranscripts$NSampleTxScore)
    if(any(naTxscore)) 
       combinedTranscripts$NSampleTxScore[naTxscore] <- 0
    filterSet <- combinedTranscripts$NSampleReadCount >= min.sampleNumber & (
        combinedTranscripts$NSampleTxScore >= min.sampleNumber) & (
        combinedTranscripts$NSampleReadProp >= min.sampleNumber)
  }
  combinedTranscripts = combinedTranscripts[filterSet,]
  return(combinedTranscripts)
}

#' calculate minimum equivalent classes for extended annotations
#' @importFrom dplyr select as_tibble %>% mutate_at mutate group_by 
#'     ungroup .funs .name_repair vars 
#' @noRd
filterTranscriptsByAnnotation <- function(rowDataCombined, annotationGrangesList,
                                          exonRangesCombined, prefix,  remove.subsetTx, 
                                          min.readFractionByEqClass, baselineFDR = 0.1, 
                                          NDR = NULL, defaultModels = NULL, verbose) {
  start.ptm <- proc.time() # (1) based on transcript usage
  
  #calculate relative read count before any filtering
  rowDataCombined <- group_by(rowDataCombined, GENEID) %>% mutate(relReadCount = readCount/sum(readCount))
  if (remove.subsetTx) { # (1) based on compatibility with annotations
    notCompatibleIds <- which(!grepl("compatible", rowDataCombined$readClassType) |
        rowDataCombined$readClassType == "equal:compatible") #keep equal for FDR calculation
    exonRangesCombined <- exonRangesCombined[notCompatibleIds]
    rowDataCombined <- rowDataCombined[notCompatibleIds,]
  }
  #(2) remove transcripts below NDR threshold/identical junctions to annotations
  rowDataCombined = calculateNDROnTranscripts(rowDataCombined, 
                        useTxScore = length(annotationGrangesList)==0)

  if(length(annotationGrangesList)>0){ #only recommend an NDR if its possible to calculate an NDR
      NDR = recommendNDR(rowDataCombined, baselineFDR, NDR, defaultModels, verbose)
  } else {
      if(is.null(NDR)) NDR = 0.5
  }
  filterSet = (rowDataCombined$NDR <= NDR)
  exonRangesCombined <- exonRangesCombined[filterSet]
  rowDataCombined <- rowDataCombined[filterSet,]
  #calculate relative subset read count after filtering (increase speed, subsets are not considered here)
  mcols(exonRangesCombined)$txid <- seq_along(exonRangesCombined)
  minEq <- getMinimumEqClassByTx(exonRangesCombined)$eqClassById
  rowDataCombined$relSubsetCount <- rowDataCombined$readCount/unlist(lapply(minEq, function(x){return(sum(rowDataCombined$readCount[x]))}))
  #post extend annotation filters applied here (currently only subset filter)
  if(min.readFractionByEqClass>0 & sum(filterSet)>0) { # filter out subset transcripts based on relative expression
    filterSet <- rowDataCombined$relSubsetCount > min.readFractionByEqClass
    exonRangesCombined <- exonRangesCombined[filterSet]
    rowDataCombined <- rowDataCombined[filterSet,]
  }
  if(sum(filterSet==0) & length(annotationGrangesList)==0) stop(
    "WARNING - No annotations were provided. Please increase NDR threshold to use novel transcripts")
  if(sum(filterSet)==0) message("WARNING - No novel transcripts meet the given thresholds. Try a higher NDR.")
  # (3) combine novel transcripts with annotations
  extendedAnnotationRanges <- combineWithAnnotations(
    rowDataCombined, exonRangesCombined, 
    annotationGrangesList, prefix)
  metadata(extendedAnnotationRanges)$NDRthreshold = NDR
  end.ptm <- proc.time()
  if (verbose) message("transcript filtering in ",
                       round((end.ptm - start.ptm)[3] / 60, 1), " mins.")
  return(extendedAnnotationRanges)
}

#' calculates an expected NDR based on the annotations'
#' @noRd
recommendNDR <- function(combinedTranscripts, baselineFDR = 0.1, NDR = NULL, defaultModels = defaultModels, verbose = FALSE){
    if(verbose) message("-- Predicting annotation completeness to determine NDR threshold --")
    equal = combinedTranscripts$readClassType == "equal:compatible"
    equal[is.na(equal)] = FALSE

    #add envirnment so poly() works
    attr(defaultModels$lmNDR[["terms"]], ".Environment") <- new.env(parent = parent.env(globalenv()))
    baseline = predict(defaultModels$lmNDR, newdata=data.frame(NDR=baselineFDR))
    attr(defaultModels$lmNDR[["terms"]], ".Environment") = c()

    NDRscores = calculateNDR(combinedTranscripts$maxTxScore.noFit, equal)
    score = combinedTranscripts$maxTxScore.noFit
    NDR.rec = predict(lm(NDRscores~poly(score,3,raw=TRUE)), newdata=data.frame(score=baseline))
    NDR.rec = round(NDR.rec,3)
    if(verbose) message("Recommended NDR for baseline FDR of ", baselineFDR, " = ", NDR.rec)
    if (NDR.rec < 0) NDR.rec = 0
    if(NDR.rec > 0.5){
        message("A high NDR threshold is being recommended by Bambu indicating high levels of novel transcripts, ",
        "limiting the performance of the trained model")
        message("We recommend training a new model on similiar but well annotated dataset if available ",
        "(https://github.com/GoekeLab/bambu/tree/master#Training-a-model-on-another-speciesdataset-and-applying-it)",
        ", or alternatively running Bambu with opt.discovery=list(fitReadClassModel=FALSE)")
    }
    
    #if users are using an NDR let them know if the recommended NDR is different
    if(is.null(NDR)) 
    {
        NDR = NDR.rec
        message("Using a novel discovery rate (NDR) of: ", NDR)
    }
    else{
        if(abs(NDR.rec-NDR)>=0.1){
            message(paste0("For your combination of sample and reference annotations we recommend an NDR of ", NDR.rec,
            ". You are currently using an NDR threshold of ", NDR, 
            ". A higher NDR is suited for samples where the reference annotations are poor and more novel transcripts are expected,", 
            "whereas a lower NDR is suited for samples with already high quality annotations"))
        }
    }
    return(NDR)
}


#' Calculate NDR based on transcripts 
#' @noRd
calculateNDROnTranscripts <- function(combinedTranscripts, useTxScore = FALSE){
      # calculate and filter by NDR
    equal = combinedTranscripts$readClassType == "equal:compatible"
    equal[is.na(equal)] = FALSE
    if(sum(equal, na.rm = TRUE)<50 | sum(!equal, na.rm = TRUE)<50 | useTxScore){
          combinedTranscripts$NDR = 1 - combinedTranscripts$maxTxScore
          if(!useTxScore) message("WARNING - Less than 50 TRUE or FALSE read classes ",
            "for NDR precision stabilization.")
          message("NDR will be approximated as: (1 - Transcript Model Prediction Score)")
    } else combinedTranscripts$NDR = calculateNDR(combinedTranscripts$maxTxScore, equal)
    return(combinedTranscripts)
}

#' calculates the minimum NDR for each score 
#' @noRd
calculateNDR = function(score, labels){
    scoreOrder = order(score, decreasing = TRUE) 
    labels = labels[scoreOrder]
    NDR = cumsum(!labels)/(seq_len(length(labels))) #calculate NDR
    NDR = rev(cummin(rev(NDR))) #flatten NDR so its never higher than a lower ranked RC
    return(NDR[order(scoreOrder)]) #return to original order
}

#' generate exon/intron ByReadClass objects
#' @importFrom GenomicRanges makeGRangesListFromFeatureFragments
#' @importFrom GenomeInfoDb seqlevels seqlevels<-
#' @importFrom dplyr select distinct 
#' @noRd
makeExonsIntronsSpliced <- function(transcriptsTibble,annotationSeqLevels){
  if(all(is.na(transcriptsTibble$intronStarts))){
      intronsByReadClass = GRangesList()}
  else {    intronsByReadClass <- makeGRangesListFromFeatureFragments(
        seqnames = transcriptsTibble$chr,
        fragmentStarts = transcriptsTibble$intronStarts,
        fragmentEnds = transcriptsTibble$intronEnds,
        strand = transcriptsTibble$strand)
  }
  names(intronsByReadClass) <- seq_along(intronsByReadClass)
  seqlevels(intronsByReadClass) <-
    unique(c(seqlevels(intronsByReadClass), 
             annotationSeqLevels))
  exonsByReadClass <- createExonByReadClass(
    transcriptsTibble, annotationSeqLevels)
  return(list("exons" = exonsByReadClass, 
              "introns" = intronsByReadClass))
}


#' create exonsByReadClass
#' @param seFilteredSpliced a SummarizedExperiment object
#' for filtered spliced reads
#' @param annotationGrangesList annotation GRangesList object
#' @importFrom GenomicRanges makeGRangesListFromFeatureFragments unlist narrow
#'     elementNROWS relist
#' @importFrom GenomeInfoDb seqlevels
#' @noRd
createExonByReadClass <- function(transcriptsTibble, annotationSeqLevels) {
  exonEndsShifted <- paste(transcriptsTibble$intronStarts,
                           as.integer(transcriptsTibble$end + 1), sep = ",")
  exonStartsShifted <- paste(as.integer(transcriptsTibble$start - 1),
                             transcriptsTibble$intronEnds, sep = ",")
  exonsByReadClass <- makeGRangesListFromFeatureFragments(
    seqnames = transcriptsTibble$chr,
    fragmentStarts = exonStartsShifted,
    fragmentEnds = exonEndsShifted,
    strand = transcriptsTibble$strand)
  exonsByReadClass <- narrow(exonsByReadClass, start = 2, end = -2)
  # correct junction to exon differences in coordinates
  names(exonsByReadClass) <- seq_along(exonsByReadClass)
  # add exon start and exon end rank
  unlistData <- unlist(exonsByReadClass, use.names = FALSE)
  partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByReadClass)),
                                    names = NULL)
  exon_rank <- lapply(width((partitioning)), seq, from = 1)
  # * assumes positive for exon ranking
  negative_strand <- which(transcriptsTibble$strand == "-")
  exon_rank[negative_strand] <- lapply(exon_rank[negative_strand], rev) 
  exon_endRank <- lapply(exon_rank, rev)
  unlistData$exon_rank <- unlist(exon_rank)
  unlistData$exon_endRank <- unlist(exon_endRank)
  exonsByReadClass <- relist(unlistData, partitioning)
  seqlevels(exonsByReadClass) <-
    unique(c(seqlevels(exonsByReadClass), annotationSeqLevels))
  return(exonsByReadClass)
}



#' extended annotations for spliced reads
#' @noRd
addNewSplicedReadClasses <- function(combinedTranscriptRanges, 
                                     rowDataFilteredSpliced, annotationGrangesList, min.exonDistance, 
                                     min.primarySecondaryDist, min.primarySecondaryDistStartEnd, verbose){
  start.ptm <- proc.time()
  exonsByReadClass <- combinedTranscriptRanges$exons
  intronsByReadClass <- combinedTranscriptRanges$introns
  ovExon <- 
    findSpliceOverlapsQuick(cutStartEndFromGrangesList(exonsByReadClass),
                            cutStartEndFromGrangesList(annotationGrangesList)) # slow
  classificationTable <- 
    data.frame(matrix("", nrow = length(exonsByReadClass), ncol = 9), 
               stringsAsFactors = FALSE)
  colnames(classificationTable) <- c("equal", "compatible", "newWithin",
                                     "newLastJunction","newFirstJunction","newJunction", "allNew", 
                                     "newFirstExon", "newLastExon")
  equalQhits <- queryHits(ovExon[mcols(ovExon)$equal])
  classificationTable$equal[equalQhits[!duplicated(equalQhits)]] <- "equal"
  compatibleQhits <- queryHits(ovExon[mcols(ovExon)$compatible])
  classificationTable$compatible[
    compatibleQhits[!duplicated(compatibleQhits)]] <- "compatible"
  classificationTable$compatible[classificationTable$equal == "equal"] <- ""
  # annotate with transcript and gene Ids
  equalSubHits <- subjectHits(ovExon[mcols(ovExon)$equal])
  rowDataFilteredSpliced$TXNAME <- NA
  rowDataFilteredSpliced$TXNAME[
    equalQhits[!duplicated(equalQhits)]] <-
    mcols(annotationGrangesList)$TXNAME[
      equalSubHits[!duplicated(equalQhits)]]
  compatibleSubHits <- subjectHits(ovExon[mcols(ovExon)$compatible])
  rowDataFilteredSpliced$GENEID <- NA
  rowDataFilteredSpliced$GENEID[
    compatibleQhits[!duplicated(compatibleQhits)]] <-
    mcols(annotationGrangesList)$GENEID[
      compatibleSubHits[!duplicated(compatibleQhits)]]
  # annotate with compatible gene id,
  rowDataFilteredSpliced$GENEID[equalQhits[!duplicated(equalQhits)]] <-
    mcols(annotationGrangesList[equalSubHits[!duplicated(equalQhits)]])$GENEID
  # annotate as identical, using intron matches
  unlistedIntrons <- unlist(intronsByReadClass, use.names = TRUE)
  partitioning <- PartitioningByEnd(cumsum(elementNROWS(intronsByReadClass)),
                                    names = NULL)
  unlistedIntronsAnnotations <- unlist(myGaps(annotationGrangesList))
  mcols(unlistedIntronsAnnotations)$GENEID <- mcols(annotationGrangesList
  )$GENEID[match(names(unlistedIntronsAnnotations),
                 mcols(annotationGrangesList)$TXNAME)]
  classificationTable <- 
    updateWIntronMatches(unlistedIntrons, unlistedIntronsAnnotations,
                         partitioning, classificationTable, annotationGrangesList,
                         rowDataFilteredSpliced, exonsByReadClass, min.exonDistance,
                         min.primarySecondaryDist, min.primarySecondaryDistStartEnd)             
  rowDataFilteredSpliced$readClassType <-
    apply(classificationTable, 1, function(x){paste(x[x!=""], collapse = ":")})
  rowDataFilteredSpliced$novelTranscript = TRUE
  rowDataFilteredSpliced$novelTranscript[classificationTable$equal=="equal"] = FALSE
  end.ptm <- proc.time()
  if (verbose) message("extended annotations for spliced reads in ",
                       round((end.ptm - start.ptm)[3] / 60, 1), " mins.")
  return(rowDataFilteredSpliced)
}


#' update classificationTable
#' @importFrom GenomicRanges match
#' @noRd
updateWIntronMatches <- function(unlistedIntrons, unlistedIntronsAnnotations,
                                partitioning, classificationTable, annotationGrangesList,
                                rowDataFilteredSpliced, exonsByReadClass, min.exonDistance,
                                min.primarySecondaryDist, min.primarySecondaryDistStartEnd){
  intronMatches <- match(
    unlistedIntrons,unique(unlistedIntronsAnnotations),nomatch = 0) > 0
  intronMatchesList <- relist(intronMatches, partitioning)
  classificationTable$newWithin[all(intronMatchesList) & !(
    classificationTable$compatible == "compatible" |
      classificationTable$equal == "equal")] <- "newWithin"
  intronMatchListRev <- lapply(intronMatchesList, rev)
  lastJunctionMatch <- unlist(lapply(intronMatchListRev,`[[`, 1))
  firstJunctionMatch <- unlist(lapply(intronMatchesList, `[[`, 1))
  strandVec <- rowDataFilteredSpliced$strand
  intronMatchAny <- any(intronMatchesList)
  classificationTable$newLastJunction[which( strandVec == "+" & !lastJunctionMatch &
                                               intronMatchAny | (strandVec == "-" & (!firstJunctionMatch &
                                                                                       intronMatchAny)))] <- "newLastJunction"
  classificationTable$newFirstJunction[which(strandVec == "+" & !firstJunctionMatch &
                                               intronMatchAny | (strandVec == "-" & (!lastJunctionMatch &
                                                                                       intronMatchAny)))] <- "newFirstJunction"
  classificationTable$newJunction[(
    sum(!intronMatchesList) > !firstJunctionMatch + !lastJunctionMatch) &
      intronMatchAny] <- "newJunction"
  classificationTable$allNew[!intronMatchAny] <- "allNew"
  ## assign gene ids based on the max # of matching introns/splice junctions
  overlapsNewIntronsAnnotatedIntrons <- findOverlaps(unlistedIntrons,
                                                     unlistedIntronsAnnotations, type = "equal",
                                                     select = "all", ignore.strand = FALSE)
  if (length(overlapsNewIntronsAnnotatedIntrons)) {
    distNewTxByQuery <- assignGeneIDbyMaxMatch(
      unlistedIntrons,unlistedIntronsAnnotations,
      overlapsNewIntronsAnnotatedIntrons, exonsByReadClass,
      rowDataFilteredSpliced, annotationGrangesList, min.exonDistance,
      min.primarySecondaryDist, min.primarySecondaryDistStartEnd)
    classificationTable$compatible[distNewTxByQuery$queryHits[
      distNewTxByQuery$compatible]] <- "compatible"
    classificationTable$newFirstExon[distNewTxByQuery$queryHits[
      !distNewTxByQuery$startMatch]] <- "newFirstExon"
    classificationTable$newFirstExon[
      classificationTable$newFirstJunction != "newFirstJunction"] <- ""
    classificationTable$newLastExon[distNewTxByQuery$queryHits[
      !distNewTxByQuery$endMatch]] <- "newLastExon"
    classificationTable$newLastExon[
      classificationTable$newLastJunction != "newLastJunction"] <- ""
  }
  return(classificationTable)
}



#' assign gene id by maximum match
#' @importFrom dplyr as_tibble %>% group_by summarise filter ungroup
#' @noRd
assignGeneIDbyMaxMatch <- function(unlistedIntrons,
                                   unlistedIntronsAnnotations, overlapsNewIntronsAnnotatedIntrons,
                                   exonsByReadClass, rowDataFilteredSpliced, annotationGrangesList,
                                   min.exonDistance, min.primarySecondaryDist,
                                   min.primarySecondaryDistStartEnd) {
  maxGeneCountPerNewTx <- as_tibble(data.frame(txId =
                                                 names(unlistedIntrons)[queryHits(overlapsNewIntronsAnnotatedIntrons)],
                                               geneId = mcols(unlistedIntronsAnnotations)$GENEID[
                                                 subjectHits(overlapsNewIntronsAnnotatedIntrons)],
                                               stringsAsFactors = FALSE)) %>%
    group_by(txId, geneId) %>%
    summarise(geneCount = n()) %>%
    group_by(txId) %>%
    filter(geneCount == max(geneCount)) %>%
    filter(!duplicated(txId)) %>%
    ungroup()
  geneIdByIntron <- rep(NA, length(exonsByReadClass))
  geneIdByIntron <- maxGeneCountPerNewTx$geneId[
    match(names(exonsByReadClass), maxGeneCountPerNewTx$txId)]
  rowDataFilteredSpliced$GENEID[is.na(rowDataFilteredSpliced$GENEID)] <-
    geneIdByIntron[is.na(rowDataFilteredSpliced$GENEID)]
  distNewTx <- calculateDistToAnnotation(
    exByTx = exonsByReadClass,
    exByTxRef = annotationGrangesList,
    maxDist = min.exonDistance,
    primarySecondaryDist = min.primarySecondaryDist,
    primarySecondaryDistStartEnd = min.primarySecondaryDistStartEnd,
    ignore.strand = FALSE)
  distNewTxByQuery <- distNewTx %>%
    group_by(queryHits) %>%
    summarise(
      minDist = min(dist),
      startMatch = any(startMatch),
      endMatch = any(endMatch),
      compatible = any(compatible))
  ## note: here is more information that can be used to filter and annotate!
  return(distNewTxByQuery)
}


#' Calculate distance from read class to annotation
#' @param exByTx exByTx
#' @param exByTxRef exByTxRef
#' @param maxDist defaults to 35
#' @param primarySecondaryDist defaults to 5
#' @param ignore.strand defaults to FALSE
#' @importFrom dplyr ungroup %>%
#' @noRd
calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35,
                                      primarySecondaryDist = 5, primarySecondaryDistStartEnd = 5,
                                      ignore.strand = FALSE) {
  # (1)  find overlaps of read classes with annotated transcripts,
  spliceOverlaps <- findSpliceOverlapsByDist(exByTx, exByTxRef,
                                             maxDist = maxDist, firstLastSeparate = TRUE,
                                             dropRangesByMinLength = TRUE, cutStartEnd = TRUE,
                                             ignore.strand = ignore.strand)
  txToAnTableFiltered <- genFilteredAnTable(spliceOverlaps,
      primarySecondaryDist, primarySecondaryDistStartEnd, DistCalculated = FALSE)
  # (2) calculate splice overlap for any not in the list (new exon >= 35bp)
  setTMP <- unique(txToAnTableFiltered$queryHits)

  spliceOverlaps_rest <- findSpliceOverlapsByDist(exByTx[-setTMP],
                                                    exByTxRef, maxDist = 0, type = "any", firstLastSeparate = TRUE,
                                                    dropRangesByMinLength = FALSE, cutStartEnd = TRUE,
                                                    ignore.strand = ignore.strand)
  if(length(spliceOverlaps_rest) > 0){
    txToAnTableRest <-
      genFilteredAnTable(spliceOverlaps_rest, primarySecondaryDist,primarySecondaryDistStartEnd,
                        exByTx = exByTx, setTMP = setTMP, DistCalculated = FALSE)
    # (3) find overlaps for remaining reads 
    setTMPRest <- unique(c(txToAnTableRest$queryHits, setTMP))
    txToAnTableRestStartEnd <- NULL
    if (length(exByTx[-setTMPRest])) {
      spliceOverlaps_restStartEnd <-
        findSpliceOverlapsByDist(exByTx[-setTMPRest], exByTxRef,
                                maxDist = 0, type = "any", firstLastSeparate = TRUE,
                                dropRangesByMinLength = FALSE,
                                cutStartEnd = FALSE, ignore.strand = ignore.strand)
      if (length(spliceOverlaps_restStartEnd)) {
        txToAnTableRestStartEnd <-
          genFilteredAnTable(spliceOverlaps_restStartEnd,
                            primarySecondaryDist, exByTx = exByTx,
                            setTMP = setTMPRest, DistCalculated = TRUE)
      }
    }
    txToAnTableFiltered <- rbind( txToAnTableFiltered,
                                  txToAnTableRest, txToAnTableRestStartEnd )
  } 
  txToAnTableFiltered <- txToAnTableFiltered %>% ungroup()
  txToAnTableFiltered$readClassId <-
    names(exByTx)[txToAnTableFiltered$queryHits]
  txToAnTableFiltered$annotationTxId <-
    names(exByTxRef)[txToAnTableFiltered$subjectHits]
  txToAnTableFiltered$txid <-
    mcols(exByTxRef)$txid[txToAnTableFiltered$subjectHits]
  return(txToAnTableFiltered)
}


#' generate filtered annotation table
#' @param spliceOverlaps an output from  findSpliceOverlapsByDist()
#' @param primarySecondaryDist default 5
#' @param primarySecondaryDistStartEnd default 5
#' @param exByTx default NULL
#' @param setTMP default NULL
#' @param DistCalculated default FALSE
#' @importFrom dplyr as_tibble group_by %>% mutate n arrange filter arrange
#' @noRd
genFilteredAnTable <- function(spliceOverlaps, primarySecondaryDist = 5,
                               primarySecondaryDistStartEnd = 5, exByTx = NULL, setTMP = NULL,
                               DistCalculated = FALSE) {
  ## initiate the table
  if (isFALSE(DistCalculated)) {
    txToAnTable <- as_tibble(spliceOverlaps) %>% group_by(queryHits) %>%
      mutate(dist = uniqueLengthQuery + uniqueLengthSubject) %>%
      mutate(txNumber = n())
  } else {
    txToAnTable <- as_tibble(spliceOverlaps) %>% group_by(queryHits) %>%
      mutate(dist = uniqueLengthQuery + uniqueLengthSubject +
               uniqueStartLengthQuery + uniqueEndLengthQuery) %>%
      mutate(txNumber = n())
  }
  ## change query hits for step 2 and 3
  if (!is.null(exByTx)) {
    txToAnTable$queryHits <-
      (seq_along(exByTx))[-setTMP][txToAnTable$queryHits]
  }
  ## todo: check filters, what happens to reads with only start and end match?
  if (isFALSE(DistCalculated)) {
    txToAnTableFiltered <- txToAnTable %>%
      group_by(queryHits) %>%
      arrange(queryHits, dist) %>%
      filter(dist <= (min(dist) + primarySecondaryDist)) %>%
      filter(queryElementsOutsideMaxDist + 
               subjectElementsOutsideMaxDist == 
               min(queryElementsOutsideMaxDist +
                     subjectElementsOutsideMaxDist)) %>% 
      filter((uniqueStartLengthQuery <= primarySecondaryDistStartEnd &
                uniqueEndLengthQuery <= primarySecondaryDistStartEnd) ==
               max(uniqueStartLengthQuery <=
                     primarySecondaryDistStartEnd & uniqueEndLengthQuery <=
                     primarySecondaryDistStartEnd)) %>%
      mutate(txNumberFiltered = n())
  } else {
    txToAnTableFiltered <- txToAnTable %>%
      group_by(queryHits) %>%
      arrange(queryHits, dist) %>%
      filter(dist <= (min(dist) + primarySecondaryDist)) %>%
      mutate(txNumberFiltered = n())
  }
  return(txToAnTableFiltered)
}

#' extended annotations for unspliced reads
#' @param se a summarized experient object
#' @param seFilteredSpliced seFilteredSpliced
#' @param exonsByReadClass exonsByReadClass
#' @param annotationGrangesList annotationGrangesList
#' @param filterSet1 filterSet1
#' @param min.exonOverlap min.exonOverlap
#' @param verbose verbose
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomeInfoDb seqlevels seqlevels<-
#' @importFrom SummarizedExperiment rbind
#' @noRd
addNewUnsplicedReadClasses <- function(rowDataFilteredUnspliced,  rowDataFilteredSpliced,
                                       exonsByReadClass, annotationGrangesList, min.exonOverlap, verbose) {
  start.ptm <- proc.time()
  rowDataCombined <- rowDataFilteredSpliced
  exonRangesCombined <- exonsByReadClass
  names(exonRangesCombined) <- seq_along(exonRangesCombined)
  if (nrow(rowDataFilteredUnspliced)) {
    exonsByReadClassUnspliced <- GRanges(
      seqnames = rowDataFilteredUnspliced$chr,
      ranges = IRanges(start = rowDataFilteredUnspliced$start,
                       end = rowDataFilteredUnspliced$end),
      strand = rowDataFilteredUnspliced$strand)
    partitioning <- PartitioningByEnd(seq_along(exonsByReadClassUnspliced),
                                      names = NULL)
    exonsByReadClassUnspliced$exon_rank <-
      rep(1, length(exonsByReadClassUnspliced))
    exonsByReadClassUnspliced$exon_endRank <-
      rep(1, length(exonsByReadClassUnspliced))
    exonsByReadClassUnspliced <-
      relist(exonsByReadClassUnspliced, partitioning)
    seqlevels(exonsByReadClassUnspliced) <- 
      unique(c(seqlevels(exonsByReadClassUnspliced),
               seqlevels(annotationGrangesList)))
    rowDataFilteredUnspliced$GENEID <- NA
    rowDataFilteredUnspliced$TXNAME <- NA
    rowDataFilteredUnspliced$readClassType <- "unsplicedNew"
    rowDataFilteredUnspliced$novelGene <- TRUE
    rowDataFilteredUnspliced$novelTranscript <- TRUE
    ## add filter to remove unspliced transcripts which overlap
    ## with known transcripts/high quality spliced transcripts
    overlapUnspliced <- findOverlaps(exonsByReadClassUnspliced,
                                     annotationGrangesList, minoverlap = min.exonOverlap,
                                     select = "first")
    naOverlapUnspliced <- which(is.na(overlapUnspliced))
    if(length(naOverlapUnspliced)) {
      rowDataFilteredUnspliced <- 
        rowDataFilteredUnspliced[naOverlapUnspliced,]
      exonsByReadClassUnspliced <-
        exonsByReadClassUnspliced[naOverlapUnspliced]
      ## combined spliced and unspliced Tx candidates
      rowDataCombined <- 
        bind_rows(rowDataFilteredSpliced, rowDataFilteredUnspliced)
      exonRangesCombined <- c(exonsByReadClass, exonsByReadClassUnspliced)
      names(exonRangesCombined) <- seq_along(exonRangesCombined)
    }
  } 
  end.ptm <- proc.time()
  if (verbose) message("extended annotations for unspliced reads in ",
                       round((end.ptm - start.ptm)[3] / 60, 1), " mins.")
  return(list("rowDataCombined" = rowDataCombined, 
              "exonRangesCombined" = exonRangesCombined))
}

#' calculate distance between first and last exon matches
#' @param candidateList candidateList
#' @param filteredOverlapList filteredOverlapList
#' @importFrom dplyr select rename %>% left_join group_by filter 
#'     ungroup distinct
#' @noRd
includeOverlapReadClass <- function(candidateList, filteredOverlapList) {
  temp <- left_join(candidateList, filteredOverlapList,
                    by = c("subjectHits" = "queryHits")) %>%
    group_by(queryHits) %>%
    filter(!subjectHits.y %in% subjectHits, !is.na(subjectHits.y)) %>%
    ungroup() %>%
    dplyr::select(queryHits, subjectHits.y) %>%
    distinct() %>%
    rename(subjectHits = subjectHits.y)
  return(temp)
}

#' combine annotations with predicted transcripts
#' @noRd
combineWithAnnotations <- function(rowDataCombinedFiltered, 
                                        exonRangesCombinedFiltered,annotationGrangesList, prefix){
    rowDataCombinedFiltered$txClassDescription <- rowDataCombinedFiltered$readClassType
    rowDataCombinedFiltered$txClassDescription[rowDataCombinedFiltered$readClassType
                                       == "unsplicedNew" & rowDataCombinedFiltered$novelGene] <-
      "newGene-unspliced"
    rowDataCombinedFiltered$txClassDescription[rowDataCombinedFiltered$readClassType
                                       == "allNew" & rowDataCombinedFiltered$novelGene] <-
      "newGene-spliced"
    extendedAnnotationRanges <- exonRangesCombinedFiltered
    mcols(extendedAnnotationRanges) <-
      rowDataCombinedFiltered[, c("GENEID", "novelGene", "novelTranscript", "txClassDescription","readCount", "NDR",
                                  "relReadCount", "relSubsetCount")]
    equalRanges = rowDataCombinedFiltered[!is.na(rowDataCombinedFiltered$TXNAME),]
    #remove extended ranges that are already present in annotation
    extendedAnnotationRanges <- extendedAnnotationRanges[is.na(rowDataCombinedFiltered$TXNAME)]
    annotationRangesToMerge <- annotationGrangesList
    if(length(annotationGrangesList)){
        mcols(annotationRangesToMerge)$readCount <- NA
        mcols(annotationRangesToMerge)$txClassDescription <- "annotation"
        mcols(annotationRangesToMerge)$novelTranscript <- FALSE
        mcols(annotationRangesToMerge)$novelGene <- FALSE
        mcols(annotationRangesToMerge)$NDR <- NA
        mcols(extendedAnnotationRanges) <- mcols(extendedAnnotationRanges)[,colnames(mcols(extendedAnnotationRanges))]
        #copy over stats to annotations from read classes
        mcols(annotationRangesToMerge[equalRanges$TXNAME])$NDR = equalRanges$NDR
        mcols(annotationRangesToMerge[equalRanges$TXNAME])$readCount = equalRanges$readCount
        mcols(annotationRangesToMerge[equalRanges$TXNAME])$relReadCount = equalRanges$relReadCount
        mcols(annotationRangesToMerge[equalRanges$TXNAME])$relSubsetCount = equalRanges$relSubsetCount
    }
    if (length(extendedAnnotationRanges)) {
      mcols(extendedAnnotationRanges)$TXNAME <- paste0(
      prefix, "Tx", seq_along(extendedAnnotationRanges))
    names(extendedAnnotationRanges) <- mcols(extendedAnnotationRanges)$TXNAME
    extendedAnnotationRanges <-
      c(extendedAnnotationRanges, annotationRangesToMerge) # this will throw error in line 648-649 when extendedAnnotationRanges is empty 
    mcols(extendedAnnotationRanges)$txid <- seq_along(extendedAnnotationRanges)
    }else{
      extendedAnnotationRanges <- annotationRangesToMerge
      mcols(extendedAnnotationRanges)$txid <- seq_along(extendedAnnotationRanges)
      mcols(extendedAnnotationRanges)$relReadCount = NA
      mcols(extendedAnnotationRanges)$relSubsetCount = NA
    }
  minEqClasses <-
    getMinimumEqClassByTx(extendedAnnotationRanges) # get eqClasses
  if(!identical(names(extendedAnnotationRanges),minEqClasses$queryTxId)) warning('eq classes might be incorrect')
  mcols(extendedAnnotationRanges)$eqClassById <- minEqClasses$eqClassById
  mcols(extendedAnnotationRanges) <- mcols(extendedAnnotationRanges)[, 
                 c("TXNAME", "GENEID", "NDR", "novelGene", "novelTranscript", "txClassDescription","readCount","relReadCount", "relSubsetCount", "txid", "eqClassById")]
  return(extendedAnnotationRanges)
}


#' Estimate distance between read class and annotations
#' @param seReadClass seReadClass
#' @inheritParams bambu
#' @importFrom dplyr select left_join as_tibble mutate %>%
#' @noRd
isore.estimateDistanceToAnnotations <- function(seReadClass,
                                                annotationGrangesList, min.exonDistance = 35,
                                                min.primarySecondaryDist = 5, min.primarySecondaryDistStartEnd = 100000, 
                                                additionalFiltering = FALSE, verbose = FALSE) {
  start.ptm <- proc.time()
  readClassTable <-
    as_tibble(rowData(seReadClass), rownames = "readClassId") %>%
    dplyr::select(readClassId, confidenceType)
  distTable <- calculateDistToAnnotation(rowRanges(seReadClass),
                                         annotationGrangesList, maxDist = min.exonDistance,
                                         primarySecondaryDist = min.primarySecondaryDist,
                                         primarySecondaryDistStartEnd = min.primarySecondaryDistStartEnd,
                                         ignore.strand = FALSE)
  distTable$readCount <- assays(seReadClass)$counts[distTable$readClassId, ] 
  if (additionalFiltering) 
    distTable <- left_join(distTable, select(readClassTable,
                                             readClassId, confidenceType), by = "readClassId") %>%
    mutate(relativeReadCount = readCount / txNumberFiltered)
  distTable <- dplyr::select(distTable, annotationTxId, txid, readClassId,
      readCount, compatible, equal,dist)
  distTable <- left_join(distTable, as_tibble(mcols(annotationGrangesList)[, c("txid", "GENEID")]),
      by = c("txid" = "txid"))
  
  end.ptm <- proc.time()
  if (verbose) message("calculated distance table in ",
                       round((end.ptm - start.ptm)[3] / 60, 1), " mins.")
  readClassTable <- addGeneIdsToReadClassTable(readClassTable, distTable, 
                                               seReadClass, verbose)
  distTable <- DataFrame(distTable)
  distTable$eqClassById <- as.list(mcols(annotationGrangesList)$eqClassById)[distTable$txid]
  
  metadata(seReadClass) <- list(distTable = distTable)
  rowData(seReadClass) <- readClassTable
  return(seReadClass)
}



#' generate readClassTable
#' @importFrom dplyr select filter distinct unlist group_by mutate %>% ungroup
#'     left_join
#' @noRd
addGeneIdsToReadClassTable <- function(readClassTable, distTable, 
                                       seReadClass, verbose){
  readClassTable$equal <- readClassTable$readClassId %in%
    unlist((filter(distTable, equal) %>%
              dplyr::select(readClassId) %>% distinct()))
  readClassTable$compatible <- readClassTable$readClassId %in%
    unlist((filter(distTable, compatible) %>% distinct()))
  start.ptm <- proc.time()
  # assign read classes to genes based on the highest read count per gene
  readClassToGeneIdTable <- select(distTable, readClassId, GENEID,
                                   readCount) %>% group_by(GENEID) %>%
    mutate(geneCount = sum(readCount)) %>% distinct() %>%
    group_by(readClassId) %>% filter(geneCount == max(geneCount)) %>%
    filter(row_number() == 1) %>% 
    dplyr::select(readClassId, geneId = GENEID) %>% ungroup()
  newGeneCandidates <- (!readClassTable$readClassId %in%
                          readClassToGeneIdTable$readClassId)
  readClassToGeneIdTableNew <- 
      assignGeneIdsNoReference(rowRanges(seReadClass)[newGeneCandidates], 
      prefix = "unassigned")
  readClassToGeneIdTableNew <- tibble(names(seReadClass)[newGeneCandidates], 
      readClassToGeneIdTableNew)
  colnames(readClassToGeneIdTableNew) <- c('readClassId', 'geneId')
  readClassGeneTable <- rbind(readClassToGeneIdTable, 
                              readClassToGeneIdTableNew)
  readClassTable <- left_join(readClassTable, readClassGeneTable,
                              by = "readClassId") %>%
    dplyr::select(confidenceType, geneId, compatible, equal)
  end.ptm <- proc.time()
  if (verbose) message("added gene Ids for each read class ",
                       round((end.ptm - start.ptm)[3] / 60, 1), " mins.")
  return(readClassTable)
}
GoekeLab/bambu documentation built on April 6, 2024, 10:36 p.m.