inst/data-raw/process/Core/phaseHLA_fun.R

#-- FUN ------------------------------------------------------------------------------------------------------ #
# Functions for phasing

# Change notes:
# 01/13/2020 replaced "ethnic.grp" with "Race" for new input data format

# Phasing with complete hi-res, non-ambiguous data using H.freq as reference
phase <- function(x, loci) {
  x <- as.list(x)
  race.code <- x$Race[1]
  loci.ls <- lapply(x[loci], unique) # prevent redundant combinations for homozygous loci
  # H is all possible combinations
  H <- do.call(expand.grid, list(loci.ls, stringsAsFactors = F))
  n <- nrow(H)
  n1 <- 1:(n/2)
  n2 <- n:(n/2+1)
  # Look up h.freq for the specified race
  h.freq <- sapply(split(H, 1:n), function(hkey) H.freq[hkey, get(race.freq[[race.code]])])
  if(race.code == "Multiracial") h.freq <- rowMeans(as.data.table(h.freq))
  h.freq[is.na(h.freq)] <- 0 # if haplotype not observed, then essentially 0
  # Get observed likelihood by pair
  pair.freq <- h.freq[n1] * h.freq[n2]
  max.pair.freq <- max(pair.freq)
  if(max.pair.freq > 0) {
    mostLikely <- which(pair.freq == max(pair.freq))[1]
  } else {
  # On occassions where l1 x l2 = 0 (because obs. freq of l1 or l2 is 0),
  # and no combination > 0, go with combination with largest l1 or l2
    mostLikely <- which(h.freq == max(h.freq))[1]
    if(mostLikely %in% n2) mostLikely <- which(n2 == mostLikely)
  }
  H$h.freq <- h.freq
  phased <- list(ID = x$ID[1], Race = race.code, `1` = H[n1, loci][mostLikely, ], `2` = H[n2, loci][mostLikely, ],
                 max.pair.freq = max.pair.freq, H = H)
  return(phased)
}

# Modified function to deal with missing C
phasexC <- function(x, loci = loci.xC) {
  x <- as.list(x)
  race.code <- x$Race[1]
  loci.ls <- lapply(x[loci], unique) # prevent redundant combinations for homozygous loci
  # H is all possible combinations
  H <- do.call(expand.grid, list(loci.ls, stringsAsFactors = F))
  n <- nrow(H)
  n1 <- 1:(n/2)
  n2 <- n:(n/2+1)
  # Look up h.freq for the specified race
  h.freq <- lapply(split(H, 1:n), function(hkey) H.freq[.(hkey$A, hkey$B, unique(C), hkey$DRB1, hkey$DQB1), get(race.freq[[race.code]])])
  h.freq <- sapply(h.freq, function(x) if(all(is.na(x))) 0 else max(x, na.rm = T))
  # Get observed likelihood by pair
  pair.freq <- h.freq[n1] * h.freq[n2]
  max.pair.freq <- max(pair.freq)
  if(max.pair.freq > 0) {
    mostLikely <- which(pair.freq == max(pair.freq))[1]
  } else {
    # On occassions where l1 x l2 = 0 (because obs. freq of l1 or l2 is 0),
    # and no combination > 0, go with combination with largest l1 or l2
    mostLikely <- which(h.freq == max(h.freq))[1]
    if(mostLikely %in% n2) mostLikely <- which(n2 == mostLikely)
  }
  H$h.freq <- h.freq
  phased <- list(ID = x$ID[1], Race = race.code, `1` = H[n1, loci][mostLikely, ], `2` = H[n2, loci][mostLikely, ],
                 max.pair.freq = max.pair.freq, H = H)
  return(phased)
}

# Modified function to deal with low-res entries
phaseA <- function(x, loci) {
  x <- as.list(x)
  race.code <- x$Race[1]
  loci.ls <- x[loci]
  # If low-res allele, get set of possible hi-res, e.g. 02 -> {02:01, 02:02, 02:03, ... }
  for(l in names(loci.ls)) loci.ls[[l]] <- sapply(loci.ls[[l]], simplify = F, # TO DO: re-write to avoid for-loop
                                                  function(a) if(nchar(a) == 2) grep(paste0(a, ":"), unique(H.freq[[l]]), val = T) else a)
  H <- unique(expand.grid(lapply(loci.ls, unlist, use.names = F), stringsAsFactors = F)) # a very large number of combns
  h.freq <- H.freq[H, get(race.freq[[race.code]])]
  if(race.code == "Multiracial") h.freq <- rowMeans(h.freq)
  h.freq[is.na(h.freq)] <- 0
  H$h.freq <- h.freq
  H <- H[which(H$h.freq > 0), ]
  H <- H[order(H$h.freq, decreasing = T), ]
  result <- comPair(H, loci.ls) # check for complementary pairs
  return(c(ID = x$ID[1], Race = race.code, result))
}

# At each locus of H1, find allele for H2 using list in loci.ls
getH2 <- function(H1, loci.ls) {
  H2 <- Map(otherAllele, H1, loci.ls) # H1 can map to multiple H2 when data is low-res
  H2 <- unique(expand.grid(H2, stringsAsFactors = F))
  return(H2)
}

otherAllele <- function(H1.allele, locus.i) {
  alleles <- names(locus.i)
  H2.allele <- which(is.na(pmatch(alleles, H1.allele))) # H2 allele is one NOT matched; works for hetero- or homozygous locus
  H2.allele <- locus.i[[H2.allele]] # can contain one or multiple
  return(H2.allele)
}

getFreq <- function(h, H) {
  where <- Reduce(`&`, Map(`%in%`, H[, names(H) != "h.freq"], h))
  if(any(where)) {
    H2 <- H[which(where), ]
    H2 <- H2[order(H2$h.freq), ][1, ]
  } else {
    H2 <- h[1, ]
    H2$h.freq <- 0
  }
  return(H2)
}

# Given H, already ordered by h.freq, compute frequencies for all valid pairs and return most likely pair
comPair <- function(H, loci.ls) {
  H1.ls <- lapply(split(H[, names(loci.ls)], 1:nrow(H)), as.list)
  H2 <- lapply(H1.ls, getH2, loci.ls = loci.ls)
  H2 <- lapply(H2, getFreq, H = H)
  H2 <- do.call(rbind, H2)
  pair.freq <- H$h.freq * H2$h.freq
  max.pair.freq <- max(pair.freq)
  mostLikely <- which(pair.freq == max.pair.freq)[1]
  return(list(`1` = H[mostLikely, names(loci.ls)], `2` = H2[mostLikely, names(loci.ls)], max.pair.freq = max.pair.freq, H = H))
}

# Main phasing function for DR/DQ using WH as reference
phase2 <- function(x, loci, ignoreRace = T, hasAmbiguous = F) {
  x <- as.list(x)
  race.code <- x$Race[1]
  loci.ls <- if(hasAmbiguous) xpandL(x, loci) else x[loci]
  H <- do.call(expand.grid, args = list(loci.ls, stringsAsFactors = F))
  n <- nrow(H)
  n1 <- 1:(n/2)
  n2 <- n:(n/2+1)
  if(ignoreRace | race.code == "Multiracial") {
    h.count <- sapply(split(H, 1:n), function(h) WH[h, sum(Count)])
  } else {
    h.count <- sapply(split(H, 1:n), function(h) WH[h][Pop2 == race.code, sum(Count)])
  }
  h.count[is.na(h.count)] <- 0 # If haplotype not observed, then essentially 0
  pair.freq <- h.count[n1] + h.count[n2]
  mostLikely <- which(pair.freq == max(pair.freq))[1]
  H$h.count <- h.count
  phased <- list(ID = x$ID[1], Race = race.code, `1` = H[n1, loci][mostLikely, ], `2` = H[n2, loci][mostLikely, ], H = H)
  return(phased)
}

# Re-formatting ambiguous entries
reformat <- function(l) {
  s <- strsplit(l, "/")[[1]]
  if(length(s) == 2) {
    s[2] <- paste0(substr(s[1], 1, 4), s[2])
  }
  return(s)
}

# Just a wrapper that calls reformatting for all loci necessary
xpandL <- function(x, loci) {
  x <- lapply(x[loci], function(l) unlist(lapply(l, reformat)))
  return(x)
}
avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.