#-- FUN ------------------------------------------------------------------------------------------------------ #
# Functions for phasing
# Change notes:
# 01/13/2020 replaced "ethnic.grp" with "Race" for new input data format
# Phasing with complete hi-res, non-ambiguous data using H.freq as reference
phase <- function(x, loci) {
x <- as.list(x)
race.code <- x$Race[1]
loci.ls <- lapply(x[loci], unique) # prevent redundant combinations for homozygous loci
# H is all possible combinations
H <- do.call(expand.grid, list(loci.ls, stringsAsFactors = F))
n <- nrow(H)
n1 <- 1:(n/2)
n2 <- n:(n/2+1)
# Look up h.freq for the specified race
h.freq <- sapply(split(H, 1:n), function(hkey) H.freq[hkey, get(race.freq[[race.code]])])
if(race.code == "Multiracial") h.freq <- rowMeans(as.data.table(h.freq))
h.freq[is.na(h.freq)] <- 0 # if haplotype not observed, then essentially 0
# Get observed likelihood by pair
pair.freq <- h.freq[n1] * h.freq[n2]
max.pair.freq <- max(pair.freq)
if(max.pair.freq > 0) {
mostLikely <- which(pair.freq == max(pair.freq))[1]
} else {
# On occassions where l1 x l2 = 0 (because obs. freq of l1 or l2 is 0),
# and no combination > 0, go with combination with largest l1 or l2
mostLikely <- which(h.freq == max(h.freq))[1]
if(mostLikely %in% n2) mostLikely <- which(n2 == mostLikely)
}
H$h.freq <- h.freq
phased <- list(ID = x$ID[1], Race = race.code, `1` = H[n1, loci][mostLikely, ], `2` = H[n2, loci][mostLikely, ],
max.pair.freq = max.pair.freq, H = H)
return(phased)
}
# Modified function to deal with missing C
phasexC <- function(x, loci = loci.xC) {
x <- as.list(x)
race.code <- x$Race[1]
loci.ls <- lapply(x[loci], unique) # prevent redundant combinations for homozygous loci
# H is all possible combinations
H <- do.call(expand.grid, list(loci.ls, stringsAsFactors = F))
n <- nrow(H)
n1 <- 1:(n/2)
n2 <- n:(n/2+1)
# Look up h.freq for the specified race
h.freq <- lapply(split(H, 1:n), function(hkey) H.freq[.(hkey$A, hkey$B, unique(C), hkey$DRB1, hkey$DQB1), get(race.freq[[race.code]])])
h.freq <- sapply(h.freq, function(x) if(all(is.na(x))) 0 else max(x, na.rm = T))
# Get observed likelihood by pair
pair.freq <- h.freq[n1] * h.freq[n2]
max.pair.freq <- max(pair.freq)
if(max.pair.freq > 0) {
mostLikely <- which(pair.freq == max(pair.freq))[1]
} else {
# On occassions where l1 x l2 = 0 (because obs. freq of l1 or l2 is 0),
# and no combination > 0, go with combination with largest l1 or l2
mostLikely <- which(h.freq == max(h.freq))[1]
if(mostLikely %in% n2) mostLikely <- which(n2 == mostLikely)
}
H$h.freq <- h.freq
phased <- list(ID = x$ID[1], Race = race.code, `1` = H[n1, loci][mostLikely, ], `2` = H[n2, loci][mostLikely, ],
max.pair.freq = max.pair.freq, H = H)
return(phased)
}
# Modified function to deal with low-res entries
phaseA <- function(x, loci) {
x <- as.list(x)
race.code <- x$Race[1]
loci.ls <- x[loci]
# If low-res allele, get set of possible hi-res, e.g. 02 -> {02:01, 02:02, 02:03, ... }
for(l in names(loci.ls)) loci.ls[[l]] <- sapply(loci.ls[[l]], simplify = F, # TO DO: re-write to avoid for-loop
function(a) if(nchar(a) == 2) grep(paste0(a, ":"), unique(H.freq[[l]]), val = T) else a)
H <- unique(expand.grid(lapply(loci.ls, unlist, use.names = F), stringsAsFactors = F)) # a very large number of combns
h.freq <- H.freq[H, get(race.freq[[race.code]])]
if(race.code == "Multiracial") h.freq <- rowMeans(h.freq)
h.freq[is.na(h.freq)] <- 0
H$h.freq <- h.freq
H <- H[which(H$h.freq > 0), ]
H <- H[order(H$h.freq, decreasing = T), ]
result <- comPair(H, loci.ls) # check for complementary pairs
return(c(ID = x$ID[1], Race = race.code, result))
}
# At each locus of H1, find allele for H2 using list in loci.ls
getH2 <- function(H1, loci.ls) {
H2 <- Map(otherAllele, H1, loci.ls) # H1 can map to multiple H2 when data is low-res
H2 <- unique(expand.grid(H2, stringsAsFactors = F))
return(H2)
}
otherAllele <- function(H1.allele, locus.i) {
alleles <- names(locus.i)
H2.allele <- which(is.na(pmatch(alleles, H1.allele))) # H2 allele is one NOT matched; works for hetero- or homozygous locus
H2.allele <- locus.i[[H2.allele]] # can contain one or multiple
return(H2.allele)
}
getFreq <- function(h, H) {
where <- Reduce(`&`, Map(`%in%`, H[, names(H) != "h.freq"], h))
if(any(where)) {
H2 <- H[which(where), ]
H2 <- H2[order(H2$h.freq), ][1, ]
} else {
H2 <- h[1, ]
H2$h.freq <- 0
}
return(H2)
}
# Given H, already ordered by h.freq, compute frequencies for all valid pairs and return most likely pair
comPair <- function(H, loci.ls) {
H1.ls <- lapply(split(H[, names(loci.ls)], 1:nrow(H)), as.list)
H2 <- lapply(H1.ls, getH2, loci.ls = loci.ls)
H2 <- lapply(H2, getFreq, H = H)
H2 <- do.call(rbind, H2)
pair.freq <- H$h.freq * H2$h.freq
max.pair.freq <- max(pair.freq)
mostLikely <- which(pair.freq == max.pair.freq)[1]
return(list(`1` = H[mostLikely, names(loci.ls)], `2` = H2[mostLikely, names(loci.ls)], max.pair.freq = max.pair.freq, H = H))
}
# Main phasing function for DR/DQ using WH as reference
phase2 <- function(x, loci, ignoreRace = T, hasAmbiguous = F) {
x <- as.list(x)
race.code <- x$Race[1]
loci.ls <- if(hasAmbiguous) xpandL(x, loci) else x[loci]
H <- do.call(expand.grid, args = list(loci.ls, stringsAsFactors = F))
n <- nrow(H)
n1 <- 1:(n/2)
n2 <- n:(n/2+1)
if(ignoreRace | race.code == "Multiracial") {
h.count <- sapply(split(H, 1:n), function(h) WH[h, sum(Count)])
} else {
h.count <- sapply(split(H, 1:n), function(h) WH[h][Pop2 == race.code, sum(Count)])
}
h.count[is.na(h.count)] <- 0 # If haplotype not observed, then essentially 0
pair.freq <- h.count[n1] + h.count[n2]
mostLikely <- which(pair.freq == max(pair.freq))[1]
H$h.count <- h.count
phased <- list(ID = x$ID[1], Race = race.code, `1` = H[n1, loci][mostLikely, ], `2` = H[n2, loci][mostLikely, ], H = H)
return(phased)
}
# Re-formatting ambiguous entries
reformat <- function(l) {
s <- strsplit(l, "/")[[1]]
if(length(s) == 2) {
s[2] <- paste0(substr(s[1], 1, 4), s[2])
}
return(s)
}
# Just a wrapper that calls reformatting for all loci necessary
xpandL <- function(x, loci) {
x <- lapply(x[loci], function(l) unlist(lapply(l, reformat)))
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.