R/remapper.r

Defines functions make_remapper

Documented in make_remapper

#' @rdname make_mapper
#' @export
make_remapper <- function(lookup) {
  ## global funcs
  detect  <- stringi::stri_detect_regex
  `%do%`  <- foreach::`%do%`
  memoise <- memoise::memoise

  expand_code <- function(x) {
    if (is_nmdp(x)) {
      remap_nmdp_code(x, ntbl4)
    } else if (is_g_code(x)) {
      remap_g_code(x, gtbl)
    } else x
  }

  ptn <- function(x) paste0("^(", paste0(x, collapse = "|"), ")(:.*)?$")

  get_partials <- function(ex3, prt) {
    anum1 <- ex3[, .(allele_num = allele_num[1]), by = eag_num]
    setkeyv(anum1, "allele_num")
    prt2 <- na.omit(prt[anum1, .(eag_allele, eag_num, allele_num), allow.cartesian = TRUE])
    setkeyv(prt2, "eag_num")
    prt2
  }

  crosscheck_nmdp_subtypes <- function(rep) {
    prt3.2 <- copy(prt3[detect(eag_allele, rep)])
    prt3.2[, allele_num := NULL]
    prt3.2[, eag_allele := vapply(hla_burst(eag_allele), function(x) hla_collapse(x[1:2]), "")]
    prt3.2 <- unique(prt3.2)
    Map(function(nums) enp[eag_num %in% nums][, prob := prob/sum(prob)],
        nums = split(prt3.2$eag_num, prt3.2$eag_allele))
  }

  crosscheck_eag_nums <- function(rep) {
    nums <- unique(prt3[detect(eag_allele, rep), eag_num])
    enp[eag_num %in% nums][, prob := prob/sum(prob)]
  }

  ## global objects
  ex2 <- exon(lookup, 2)[, .(eag_allele, eag_num, allele_num)]
  ex2 <- na.omit(ex2[!duplicated(ex2)])
  setkeyv(ex2, "eag_allele")

  ex3 <- exon(lookup, 3)[, .(eag_allele, eag_num, allele_num)]
  ex3 <- na.omit(ex3[!duplicated(ex3)])
  setkeyv(ex3, "eag_allele")

  jkr   <- lookup$jkr_tbl$eag_allele
  prt3  <- rbind(ex3, get_partials(ex3, prt = lookup$prt_tbl))
  gtbl  <- lookup$gtbl
  ntbl4 <- lookup$ntbl4
  lra   <- lookup$lra

  ## DT of unique exon 3 EAGs and their approximate probability
  ## of accurance simply based on how many alleles they stand
  ## for: eag_num_prob => enp
  enp <- ex3[, .(prob = .N/nrow(ex3)), by = eag_num]

  structure(memoise(function(a) {
    if (is.null(x = a) || is.na(x = a))
      return(eag_numbers(NA, NA))

    a1   <- allele(a, 1)
    exa1 <- expand_code(a1)
    rep1 <- ptn(exa1)

    a2   <- allele(a, 2)
    exa2 <- expand_code(a2)
    rep2 <- ptn(exa2)

    nums2.1 <- unique(ex2[detect(eag_allele, rep1), eag_num])
    nums2.2 <- unique(ex2[detect(eag_allele, rep2), eag_num])

    ## update exa and rep if it's a low resolution allele
    if (any(exa1 %in% lra)) {
      tmp_rep <- ptn(ex2[eag_num %in% nums2.1, eag_allele])
      tmp <- rle(sort(ex2[detect(eag_allele, tmp_rep), eag_num]))
      nums2.1 <- tmp$values[which.max(tmp$lengths)]
      exa1 <- ex2[eag_num %in% nums2.1, eag_allele]
      rep1 <- ptn(exa1)
    }

    if (any(exa2 %in% lra)) {
      tmp_rep <- ptn(ex2[eag_num %in% nums2.2, eag_allele])
      tmp <- rle(sort(ex2[detect(eag_allele, tmp_rep), eag_num]))
      nums2.2 <- tmp$values[which.max(tmp$lengths)]
      exa2 <- ex2[eag_num %in% nums2.2, eag_allele]
      rep2 <- ptn(exa2)
    }

    if (length(nums2.1) == 0L || length(nums2.2) == 0L) {
      return(eag_numbers(NA, NA))
    }

    if (length(nums2.1) > 1L) {
      warning("More than one EAG number for allele 1 exon 2. Using only the first!", call. = FALSE, immediate. = TRUE)
      nums2.1 <- nums2.1[1]
    }

    if (length(nums2.2) > 1L) {
      warning("More than one EAG number for allele 2 exon 2. Using only the first!", call. = FALSE, immediate. = TRUE)
      nums2.2 <- nums2.2[1]
    }

    ## Exon 3 gets special treatment due to the fact that some
    ## alleles may not be found in the lookup table but only
    ## exist as jokers.
    nums3.1 <- ex3[detect(eag_allele, rep1), eag_num]
    nums3.2 <- ex3[detect(eag_allele, rep2), eag_num]

    # If both alleles on exon 3 have a single EAG number
    # each, no further checks are required.
    if (length(nums3.1) != 1L || length(nums3.2) != 1L) {
      # Allele 1 and allele 2 have one or more EAG nums.
      if (none_empty(nums3.1, nums3.2)) {
        if (a1 %in% c("05:RGPW", "07:01:01G")) {
          nums3.1 <- unique(nums3.1[duplicated(nums3.1)])
        }
        if (a2 %in% c("05:RGPW", "07:01:01G")) {
          nums3.2 <- unique(nums3.2[duplicated(nums3.2)])
        }
        # Both alleles have ambiguity codes and an intersection
        # resulting in two distinct EAG numbers can be formed.
        if (maybe_exon_shuffling(a) && length(nums3 <- intersect(nums3.1, nums3.2)) == 2) {
          nums3.1 <- nums3[1]
          nums3.2 <- nums3[2]
        }
        else {
          if (length(nums3.1 <- unique(nums3.1)) > 1) {
            #message("EAG nums ", slash(nums3.1), " in allele 1 of ", slash(a))
            enp2 <- crosscheck_eag_nums(rep = rep1)
            nums3.1 <- sample(enp2$eag_num, 1, prob = enp2$prob)
          }
          if (length(nums3.2 <- unique(nums3.2)) > 1) {
            #message("EAG nums ", slash(nums3.2), " in allele 2 of ", slash(a))
            enp2 <- crosscheck_eag_nums(rep = rep2)
            nums3.2 <- sample(enp2$eag_num, 1, prob = enp2$prob)
          }
        }
      }
      # Only allele 1 has one or more EAG numbers, that is,
      # allele 2 is part of the joker or partials group.
      #
      # Since EAGs are defined with respect to exon 2, the subtypes of
      # an ambiguous allele on exon 3 can be spread across 2 EAGs. If
      # this is the case we pick these two EAGs
      #
      # If this is not the case we randomly pick a second EAG in
      # proportion to the numbers of alleles associated with an EAG.
      #
      # We have to make sure, however, that there is no intersection
      # (outside of the jokers/partials) between this EAG and the alleles at
      # exon 2
      else if (is_empty(nums3.2) && !is_empty(nums3.1)) {
        if (a1 == "05:RGPW") {
          nums3.1 <- nums3.2 <- unique(nums3.1[duplicated(nums3.1)])
        }
        else {
          if (length(nums3 <- unique(nums3.1)) == 2) {
            nums3.1 <- nums3[1]
            nums3.2 <- nums3[2]
          }
          else if (length(nums3) > 2) {
            #message("EAG nums ", slash(nums3), " in allele 1 of ", slash(a))
            enp2 <- crosscheck_nmdp_subtypes(rep = rep1)
            if (length(enp2) == 2) {
              nums3.1 <- sample(enp2[[1]]$eag_num, 1, prob = enp2[[1]]$prob)
              nums3.2 <- sample(enp2[[2]]$eag_num, 1, prob = enp2[[2]]$prob)
            }
            else {
              enp2 <- crosscheck_eag_nums(rep = rep1)
              nums3 <- sample(enp2$eag_num, 2, replace = TRUE, prob = enp2$prob)
              nums3.1 <- nums3[1]
              nums3.2 <- nums3[2]
            }
          }
          else {
            nums3.1 <- nums3
            enp2 <- if (!all(exa2 %in% jkr)) {
              crosscheck_eag_nums(rep = rep2)
            } else if (length(aa <- ex2[eag_num == nums2.1][!detect(eag_allele, rep1), eag_allele]) > 0) {
              ## check for additional diagonal allele matches
              rs <- enp[eag_num %in% setdiff(enp$eag_num, crosscheck_eag_nums(rep = ptn(aa))$eag_num)]
              rs[, prob := prob/sum(prob)]
            } else enp
            nums3.2 <- sample(enp2$eag_num, 1, prob = enp2$prob)
          }
        }
      }
      # Only allele 2 has one or more EAG numbers, that is,
      # allele 1 is part of the joker/partials group.
      #
      # Since EAGs are defined with respect to exon 2, the subtypes of
      # an ambiguous allele on exon 3 can be spread across 2 EAG. If
      # this is the case we pick these two EAGs.
      #
      # If this is not the case we randomly pick a second EAG in
      # proportion to the numbers of alleles associated with an EAG.
      #
      # We have to make sure, however, that there is no intersection
      # (outside of the jokers/partials) between this EAG and the alleles at
      # exon 2
      else if (is_empty(nums3.1) && !is_empty(nums3.2)) {
        if (a2 == "05:RGPW") {
          nums3.1 <- nums3.2 <- unique(nums3.2[duplicated(nums3.2)])
        }
        else {
          if (length(nums3 <- unique(nums3.2)) == 2) {
            nums3.1 <- nums3[1]
            nums3.2 <- nums3[2]
          }
          else if (length(nums3) > 2) {
            #message("EAG nums ", slash(nums3), " in allele 2 of ", slash(a))
            enp2 <- crosscheck_nmdp_subtypes(rep = rep2)
            if (length(enp2) == 2) {
              nums3.1 <- sample(enp2[[1]]$eag_num, 1, prob = enp2[[1]]$prob)
              nums3.2 <- sample(enp2[[2]]$eag_num, 1, prob = enp2[[2]]$prob)
            }
            else {
              enp2 <- crosscheck_eag_nums(rep = rep2)
              nums3 <- sample(enp2$eag_num, 2, replace = TRUE, prob = enp2$prob)
              nums3.1 <- nums3[1]
              nums3.2 <- nums3[2]
            }
          }
          else {
            nums3.2 <- nums3
            enp2 <- if (!all(exa1 %in% jkr)) {
              crosscheck_eag_nums(rep = rep1)
            } else if (length(aa <- ex2[eag_num == nums2.2][!detect(eag_allele, rep2), eag_allele]) > 0) {
              ## check for additional diagonal allele matches
              rs <- enp[eag_num %in% setdiff(enp$eag_num, crosscheck_eag_nums(rep = ptn(aa))$eag_num)]
              rs[, prob := prob/sum(prob)]
            } else enp

            nums3.1 <- sample(enp2$eag_num, 1, prob = enp2$prob)
          }
        }
      }
      # Neither allele has an EAG number because both alleles
      # are part of the joker group
      else {
        ## Randomly pick EAG numbers in proportion to the
        ## numbers of alleles associated with a number. Because
        ## both alleles are part of the joker group this should
        ## be safe.
        enp2 <- if (!all(exa1 %in% jkr)) {
          crosscheck_eag_nums(rep = rep1)
        } else enp
        nums3.1 <- sample(enp2$eag_num, 1, prob = enp2$prob)
        enp2 <- if (!all(exa2 %in% jkr)) {
          crosscheck_eag_nums(rep = rep2)
        } else enp
        nums3.2 <- sample(enp2$eag_num, 1, prob = enp2$prob)
      }
    }

    nums <- eag_numbers(
      e2 = c(nums2.1, nums2.2),
      e3 = c(nums3.1, nums3.2)
    )
    return(nums)
  }), class = c("remapper", "function"))
}

#' @export
print.remapper <- function(x, ...) {
  cat("function <remapper> (a)", sep = "")
}
DKMS-LSL/HLAsim documentation built on May 6, 2019, 1:17 p.m.