inst/data-raw/process/Core/phaseHLA.R

library(data.table)

# -- DATA ------------------------------------------------------------------------------------------------------#

H.freq <- fread("A_C_B_DRB1_DQB1.tsv") # Reference data (https://frequency.nmdp.org/jsp/haploTypeFrequencies.jsp)
HLA <- fread("HiResHLAclean_ref.tsv") # nPOD Core HLA data

# Input file contains a mixture of data resolution and completeness
# hi-res loci notation ex: 04:01
# lo-res loci notation ex: 50
# ambiguous loci notation ex: 05:01/0

# -- ETC. ----------------------------------------------------------------------------------------------------- #

loci <- c("A", "B", "C", "DRB1", "DQB1")
coloci <- setNames(unlist(lapply(loci, paste0, c(".1", ".2"))), rep(loci, each = 2))
# maps the race labels from nPOD Core file to one from nmdp.org
race.freq <- list("Caucasian" = "CAU_freq", "African Am" = "AFA_freq", "Hispanic/Latino" = "HIS_freq",
                  "American Indian/Alaska Native" = "NAM_freq", "Asian" = "API_freq",
                  "Multiracial" = c("AFA_freq", "API_freq", "CAU_freq", "HIS_freq", "NAM_freq"))

# Minor cleaning and keying of reference table
H.freq[, (loci) := lapply(.SD, function(x) gsub("[^0-9:]", "", sapply(strsplit(x, "*", fixed = T), `[[`, 2))), .SDcols = loci]
setkeyv(H.freq, loci)

# -- Source functions ------------------------------------------------------------------------------------------#

source("phaseHLA_fun.R")

# -- PHASING 1 ------------------------------------------------------------------------------------------------ #

# Phasing with 5 loci ("A, "B", "C", "DRB1", "DQB1") -- get complete rows
A5 <- HLA[HLA[, Reduce(`&`, lapply(.SD, `!=`, "")), .SDcols = coloci], c("ID","ethnic.grp", coloci), with = F ]

# A few entries containing lower-resolution typing (though none with ambiguous alleles), excluded for now
# Proceed with the 5-loci-complete hi-res data
issue <- A5[, Reduce(`|`, lapply(.SD, function(x) nchar(x) != 5)), .SDcols = coloci]
# These will be processed differently, so exclude these entries for now
A5.2 <- A5[issue ]
A5 <- A5[!issue ]
A5 <- melt(A5, measure.vars = patterns(paste0("^", loci)), value.name = loci) # long format

A5.results <- lapply(split(A5, by = "ID"), phase, loci = loci)
A5.phased <- rbindlist(lapply(A5.results, function(x) as.data.table(x[1:5])))
setnames(A5.phased, c("ID", "ethnic.grp", paste0(rep(loci, 2), rep(c(".1", ".2"), each = 5)), "max.pair.freq"))

# -- PHASING 1.2 ---------------------------------------------------------------------------------- #

# Now deal with issues
A5.2 <- melt(A5.2, measure.vars = patterns(paste0("^", loci)), value.name = loci)

A5.2.results <- lapply(split(A5.2, by = "ID"), phaseA, loci = loci)
A5.2.phased <- rbindlist(lapply(A5.2.results, function(x) as.data.table(x[1:5])))
setnames(A5.2.phased, c("ID", "ethnic.grp", paste0(rep(loci, 2), rep(c(".1", ".2"), each = 5)), "max.pair.freq"))

# Append A5.2 results to main results
A5.phased <- rbind(A5.phased, A5.2.phased)

# -- PHASING 2 ------------------------------------------------------------------------------------------------ #

WH <- fread("world_haplotypes_cleaned.tsv")
setkeyv(WH, c("DRB1", "DQA1", "DQB1"))

drdq <- c("DRB1.1", "DRB1.2", "DQA1.1", "DQA1.2", "DQB1.1", "DQB1.2")
# Use cases with complete data, ignoring cases with ambiguous/missing/low-res alleles
DRDQ <- HLA[ HLA[, Reduce(`&`, lapply(.SD, function(x) nchar(x) == 5)), .SDcols = drdq], c("ID","ethnic.grp", drdq), with = F ]
DRDQ <- melt(DRDQ, measure.vars = patterns(c("DRB1", "DQA1", "DQB1")), value.name = c("DRB1", "DQA1", "DQB1"))

D3.results.T <- lapply(split(DRDQ, by = "ID"), phase2, loci = c("DRB1", "DQA1", "DQB1"))
D3.results.F <- lapply(split(DRDQ, by = "ID"), phase2, loci = c("DRB1", "DQA1", "DQB1"), ignoreRace = F)

D3.T.phased <- rbindlist(lapply(D3.results.T, function(x) as.data.table(x[c(1,3:4)])))
setnames(D3.T.phased, c("ID", unlist(lapply(c(".1", ".2"), function(x) paste0(c("DRB1", "DQA1", "DQB1"), x)))))

D3.F.phased <- rbindlist(lapply(D3.results.F, function(x) as.data.table(x[1:2])))
setnames(D3.F.phased, unlist(lapply(c(".1", ".2"), function(x) paste0(c("DRB1", "DQA1", "DQB1"), x))))

# See how many homozygous:
D3.T.phased[DRB1.1 == DRB1.2 & DQB1.1 == DQB1.2]

# View discrepant results for phasing -- only 2 cases are actually different
diff <- fsetdiff(A5.phased[, c("ID", drdq[c(1:2, 5:6)]), with = F], D3.T.phased[, c("ID", drdq[c(1:2, 5:6)]), with = F])
inverted <- diff$ID[which(Reduce(`&`, Map(`==`, A5.phased[match(diff$ID, ID), .(DRB1.1, DRB1.2, DQB1.1, DQB1.2)],
                                        D3.T.phased[match(diff$ID, ID), .(DRB1.2, DRB1.1, DQB1.2, DQB1.1)]))) ]
# Manually reassign cases
# 6007 differs between A5.phased and D3.T.phased but results with "ignoreRace = F" is consistent with A5.phased
D3.T.phased[ID == 6007, c("DRB1.1", "DRB1.2", "DQB1.1", "DQB1.2") := A5.phased[ID == 6007, .(DRB1.1, DRB1.2, DQB1.1, DQB1.2)]]
for(i in inverted) D3.T.phased[ID == i, c("DRB1.1", "DRB1.2", "DQB1.1", "DQB1.2") := A5.phased[ID == i, .(DRB1.1, DRB1.2, DQB1.1, DQB1.2)]]

phased <- merge(D3.T.phased, A5.phased, by = c("ID", "DRB1.1", "DQB1.1", "DRB1.2", "DQB1.2"), all = T)

# One row with missing data / phased[ID == 6058]
x6058 <- HLA[ID == 6058] # impute
x6058 <- melt(x6058, measure.vars = patterns(c("DRB1", "DQA1", "DQB1")), value.name = c("DRB1", "DQA1", "DQB1"))
x6058 <- phase2(x6058, loci = c("DRB1", "DQA1", "DQB1"))[1:3]

phased[ID == 6058, c("DQA1.1", "DQA1.2") := .("05:01", "04:01")]

# -- PHASING 2.2 ------------------------------------------------------------------------------------------------ #

# Review the cases still not phased
D3.ambig <- HLA[!ID %in% phased$ID]
D3.ambig <-D3.ambig[ D3.ambig[, Reduce(`&`, lapply(.SD, function(x) nchar(x) > 4)), .SDcols = drdq] ]
D3.ambig <- melt(D3.ambig, measure.vars = patterns(c("DRB1", "DQA1", "DQB1")), value.name = c("DRB1", "DQA1", "DQB1"))

D3.ambig.results <- lapply(split(D3.ambig, by = "ID"), phase2, loci = c("DRB1", "DQA1", "DQB1"), ignoreRace = F, hasAmbiguous = T)
D3.ambig.phased <- rbindlist(lapply(D3.ambig.results, function(x) as.data.table(x[1:4])))
setnames(D3.ambig.phased, c("ID", "ethnic.grp", unlist(lapply(c(".1", ".2"), function(x) paste0(c("DRB1", "DQA1", "DQB1"), x)))))

# -- FINAL ------------------------------------------------------------------------------------------------------ #

phased <- rbind(phased, D3.ambig.phased, use.names = T, fill = T)
phasedL <- melt(phased, measure.vars = patterns(paste0("^", c(LETTERS[1:3], c("DRB1", "DQA1", "DQB1")))),
                value.name = c(LETTERS[1:3], c("DRB1", "DQA1", "DQB1")), variable.name = "H")
write.table(phased, file = "phased.tsv", sep = "\t", row.names = F, quote = F)
write.table(phasedL, file = "phasedL.tsv", sep = "\t", row.names = F, quote = F)
avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.