R/mergeHits.R

Defines functions mergeHits

Documented in mergeHits

#' Function to merge adjacent events explainable by the same donor pseudogene
#'
#' @param hits data.frame storing annotations of the nominated hits (donor pseudogenes) that explain all gene conversion event generated by running the BLAT program.
#' @param lut_functional an \code{IRanges} object storing positions of mismatches between all functional alleles against all pseudogene alleles. This acts as a lookup table to determine whether the functional allele and any given pseudogene share identical nucleotide sequences within a range of interest.
#'
#' @description This function merges adjacent events only if the intervening sequence stretch is COMPLETELY identical between the functional allele and the nominated donor pseudogene. As a result of the merge, the boundaries, edit distance and other annotations of the events are adjusted accordingly.
#'
#' @import plyr
mergeHits <- function(hits, lut_functional)
{
  # merge consecutive hits on the same pseudogene
  n = 1
  hits$event <- n
  genes <- lapply(hits$gene, function(z) {
    if(!is.na(z)) unlist(strsplit(z, split = ";")) else z
  })
  # only one event
  if(nrow(hits) == 1) {
    if( is.na(hits[1, "gene"]) ){
      return(data.frame(#event = 1, possibility = "a",
        start = hits[1, "start"],
        end = hits[1, "end"], gene = NA,
        fiveprime_identical_length = NA,
        threeprime_identical_length = NA,
        edit_distance = NA,
        start_ungapped = hits[1, "start_ungapped"],
        end_ungapped = hits[1, "end_ungapped"],
        stringsAsFactors = FALSE
      ))
    } else {
      edits <- unlist(strsplit(hits[1, "edit_distance"], split = ";"))
      fp <- unlist(strsplit(hits[1, "fiveprime_identical_length"], split = ";"))
      tp <- unlist(strsplit(hits[1, "threeprime_identical_length"], split = ";"))
      names(edits) <- genes[[1]]
      names(fp) <- genes[[1]]
      names(tp) <- genes[[1]]
      edits <- edits[ which(edits == min(edits, na.rm = TRUE)) ]
      fp <- fp[ names(edits) ]
      tp <- tp[ names(edits) ]
      if(length(fp) == 0) {
        hits <- data.frame(start = hits[1, "start"], end = hits[1, "end"],
                           gene = NA, fiveprime_identical_length = NA,
                           threeprime_identical_length = NA,
                           edit_distance = NA, event = 1,
                           start_ungapped = hits[1, "start_ungapped"],
                           end_ungapped = hits[1, "end_ungapped"],
                           stringsAsFactors = FALSE)
      } else {
        hits <- data.frame(start = hits[1, "start"], end = hits[1, "end"],
                           gene = names(fp), fiveprime_identical_length = fp,
                           threeprime_identical_length = tp,
                           edit_distance = edits, event = 1,
                           start_ungapped = hits[1, "start_ungapped"],
                           end_ungapped = hits[1, "end_ungapped"],
                           stringsAsFactors = FALSE)
      }
      hits <- do.call("ddply",
                      list(hits,
                           c("event", "fiveprime_identical_length", "threeprime_identical_length", "edit_distance"),
                           summarise,
                           start = call("min", as.symbol("start")), end = call("max", as.symbol("end")),
                           start_ungapped = call("min", as.symbol("start_ungapped")),
                           end_ungapped = call("max", as.symbol("end_ungapped")),
                           gene = call("paste", as.symbol("gene"), collapse = ";")))
      return( hits[, c("start", "end", "gene",
                       "fiveprime_identical_length", "threeprime_identical_length",
                       "edit_distance", "start_ungapped", "end_ungapped")] )
    }
  }
  # more than one event
  for(z in 2:nrow(hits)){
    matches <- match(genes[[z]],
                     try( unlist(strsplit(hits[z-1, "gene"], split = ";")),
                          silent = TRUE) )
    if(length(which(!is.na(matches))) > 0){
      # count mismatches (against germline) between hits; if 0, merge; otherwise leave it
      o <- sapply(genes[[z]][ which(!is.na(matches)) ], function(g){
        spacer_start <- as.numeric( unlist(strsplit(hits[z-1, "q_end"], split = ";")) ) + 1
        names(spacer_start) <- genes[[z-1]]
        spacer_end <- as.numeric( unlist(strsplit(hits[z, "q_start"], split = ";")) ) - 1
        names(spacer_end) <- genes[[z]]
        stretch <- try( IRanges::IRanges(start = spacer_start[g], end = spacer_end[g]), silent = TRUE )
        if( class(stretch) == "try-error" ) return(1000) # just an arbitrary large number
        else{
          m <- sum(
            IRanges::width(
              IRanges::intersect(stretch,
                                 lut_functional[ which(names(lut_functional) == g) ])
            ))
          m
        }
      })
      names(o) <- genes[[z]][ which(!is.na(matches)) ]
      o <- o[o == 0]
      if(length(o) > 0) {
        # sum over edit distances
        e_z1 <- as.numeric( unlist(strsplit(hits[z-1, "edit_distance"], split = ";")) )
        names(e_z1) <- unlist(strsplit(hits[z-1, "gene"], split = ";"))
        e_z <- as.numeric( unlist(strsplit(hits[z, "edit_distance"], split = ";")) )
        names(e_z) <- genes[[z]]
        e_z1 <- e_z1[names(o)]; e_z <- e_z[names(o)]
        edits <- sapply(names(o), function(g) e_z1[g] + e_z[g])
        names(edits) <- names(o)
        # select the gene with min edit distance
        edits <- edits[ which(edits == min(edits, na.rm = TRUE)) ]
        hits[z, "edit_distance"] <- paste(edits, collapse = ";")
        hits[z-1, "edit_distance"] <- paste(edits, collapse = ";")
        # copy over fiveprime identity stretch length
        fp <- unlist(strsplit(hits[z-1, "fiveprime_identical_length"], split = ";"))
        names(fp) <- unlist(strsplit(hits[z-1, "gene"], split = ";"))
        hits[z, "fiveprime_identical_length"] <- paste(fp[names(edits)], collapse = ";")
        hits[z-1, "fiveprime_identical_length"] <- paste(fp[names(edits)], collapse = ";")
        # copy over threeprime identity stretch length
        tp <- unlist(strsplit(hits[z, "threeprime_identical_length"], split = ";"))
        names(tp) <- unlist(strsplit(hits[z, "gene"], split = ";"))
        hits[z, "threeprime_identical_length"] <- paste(tp[names(edits)], collapse = ";")
        hits[z-1, "threeprime_identical_length"] <- paste(tp[names(edits)], collapse = ";")
        o <- paste(names(edits), collapse = ";")
        hits[z, "gene"] <- o; hits[z-1, "gene"] <- o
        hits[z, "event"] <- hits[z-1, "event"]
        hits[z, "start"] <- hits[z-1, "start"]
        hits[z, "start_ungapped"] <- hits[z-1, "start_ungapped"]
        hits[z-1, "end"] <- hits[z, "end"]
        hits[z-1, "end_ungapped"] <- hits[z, "end_ungapped"]
      } else {
        n <- n + 1
        hits[z, "event"] <- n
      }
    }
    else {
      if( hits[z, "start"] != hits[z-1, "start"] ){
        n <- n + 1
      }
      hits[z, "event"] <- n
    }
  }
  hits <- hits[, c("start", "end", "gene", "fiveprime_identical_length",
                   "threeprime_identical_length" , "edit_distance", "event",
                   "start_ungapped", "end_ungapped")]
  hits <- unique(hits)
  to_delete <- c()
  if(nrow(hits) >= 2){
    for(i in 2:nrow(hits)){
      if( hits[i, "start"] == hits[i-1, "start"] & hits[i, "end"] != hits[i-1, "end"] ){
        to_delete <- c(to_delete, i-1)
      }
    }
  }
  if(length(to_delete) > 0) hits <- hits[-to_delete, ]
  # choose gene with min edit distance
  g <- apply(hits[, c("gene", "edit_distance", "fiveprime_identical_length", "threeprime_identical_length")], MARGIN = 1, function(x){
    if(is.na(x[1]) | is.na(x[2])) return( list("edit" = NA, "fp" = NA, "tp" = NA ) )
    else{
      genes <- unlist(strsplit(x[1], split = ";"))
      edits <- as.numeric( unlist(strsplit(x[2], split = ";")) )
      names(edits) <- genes
      fp <- unlist(strsplit(x[3], split = ";"))
      names(fp) <- genes
      edits <- edits[ which(edits == min(edits, na.rm = TRUE)) ]
      fp <- fp[ names(edits) ]
      tp <- unlist(strsplit(x[4], split = ";"))
      names(tp) <- genes
      edits <- edits[ which(edits == min(edits, na.rm = TRUE)) ]
      tp <- tp[ names(edits) ]
      list("edit" = edits, "fp" = fp, "tp" = tp)
    }
  })
  hits$gene <- sapply(g, function(h) paste(names(h$edit), collapse = ";"))
  hits$edit_distance <- sapply(g, function(h) paste(h$edit, collapse = ";"))
  hits$fiveprime_identical_length <- sapply(g, function(h) paste(h$fp, collapse = ";"))
  hits$threeprime_identical_length <- sapply(g, function(h) paste(h$tp, collapse = ";"))
  # deconvolute ";"-separated into separated rows
  hits <- split(hits, hits$event)
  hits <- do.call("rbind", lapply(hits, function(tb){
    if( is.na(tb[1, "gene"]) ){
      o <- data.frame(start = tb[1, "start"], end = tb[1, "end"],
                      gene = NA, fiveprime_identical_length = NA,
                      threeprime_identical_length = NA,
                      edit_distance = NA, event = tb[1, "event"],
                      start_ungapped = tb[1, "start_ungapped"],
                      end_ungapped = tb[1, "end_ungapped"],
                      stringsAsFactors = FALSE)
    } else if( tb[1, "gene"] == "" ){
      o <- data.frame(start = tb[1, "start"], end = tb[1, "end"],
                      gene = NA, fiveprime_identical_length = NA,
                      threeprime_identical_length = NA,
                      edit_distance = NA, event = tb[1, "event"],
                      start_ungapped = tb[1, "start_ungapped"],
                      end_ungapped = tb[1, "end_ungapped"],
                      stringsAsFactors = FALSE)
    } else {
      o <- data.frame(start = tb[1, "start"], end = tb[1, "end"],
                      gene = unlist(strsplit(tb[1, "gene"], split = ";")),
                      fiveprime_identical_length = unlist(strsplit(tb[1, "fiveprime_identical_length"], split = ";")),
                      threeprime_identical_length = unlist(strsplit(tb[1, "threeprime_identical_length"], split = ";")),
                      edit_distance = unlist(strsplit(tb[1, "edit_distance"], split = ";")),
                      event = tb[1, "event"],
                      start_ungapped = tb[1, "start_ungapped"],
                      end_ungapped = tb[1, "end_ungapped"],
                      stringsAsFactors = FALSE)
    }
    o
  }))
  hits <- do.call("ddply",
                  list(hits,
                       c("event", "fiveprime_identical_length", "threeprime_identical_length", "edit_distance"),
                       summarise,
                       start = call("min", as.symbol("start")), end = call("max", as.symbol("end")),
                       start_ungapped = call("min", as.symbol("start_ungapped")),
                       end_ungapped = call("max", as.symbol("end_ungapped")),
                       gene = call("paste", as.symbol("gene"), collapse = ";")))
  hits[, c("start", "end", "gene",
           "fiveprime_identical_length", "threeprime_identical_length",
           "edit_distance", "start_ungapped", "end_ungapped")]
}
Fraternalilab/BrepConvert documentation built on Oct. 14, 2022, 5:54 p.m.