#' Harmonise the alleles and effects between the exposure and outcome
#'
#' @description
#' In order to perform MR the effect of a SNP on an outcome and exposure must be harmonised to be
#' relative to the same allele.
#'
#' @details
#' Expects data in the format generated by \code{read_exposure_data} and \code{extract_outcome_data}.
#' This means the inputs must be dataframes with the following columns:
#' \code{outcome_dat}:
#' - SNP
#' - beta.outcome
#' - se.outcome
#' - effect_allele.outcome
#' - other_allele.outcome
#' - eaf.outcome
#' - outcome
#'
#' \code{exposure_dat}:
#' - SNP
#' - beta.exposure
#' - se.exposure
#' - effect_allele.exposure
#' - other_allele.exposure
#' - eaf.exposure
#'
#' @param exposure_dat Output from \code{read_exposure_data}
#' @param outcome_dat Output from \code{extract_outcome_data}
#' @param action Level of strictness in dealing with SNPs. 1=Assume all reference alleles are on the positive strand, i.e. do nothing (warning - this is very risky and is not recommended); 2=Try to infer positive strand alleles, using allele frequencies for palindromes; 3=Correct strand for non-palindromic SNPs, and drop all palindromic SNPs from the analysis. If a single value is passed then this action is applied to all outcomes. But multiple values can be supplied as a vector, each element relating to a different outcome.
#'
#' @keywords internal
#' @return Data frame with harmonised effects and alleles
harmonise_data <- function(exposure_dat, outcome_dat, action=2)
{
stopifnot(all(action %in% 1:3))
res.tab <- merge(outcome_dat, exposure_dat, by="SNP")
ncombinations <- length(unique(res.tab$id.outcome))
if(length(action) == 1)
{
action <- rep(action, ncombinations)
} else if(length(action) != ncombinations) {
stop("Action argument must be of length 1 (where the same action will be used for all outcomes), or number of unique id.outcome values (where each outcome will use a different action value)")
}
res.tab <- harmonise_cleanup_variables(res.tab)
d <- data.frame(id.outcome=unique(res.tab$id.outcome), action=action)
res.tab <- merge(res.tab, d, by="id.outcome")
fix.tab <- plyr::ddply(res.tab, c("id.exposure", "id.outcome"), function(x)
{
x <- plyr::mutate(x)
message("Harmonising ", x$exposure[1], " (", x$id.exposure[1], ") and ", x$outcome[1], " (", x$id.outcome[1], ")")
# SNP, A1, A2, B1, B2, betaA, betaB, fA, fB, tolerance, action
x <- harmonise_function_refactored(x, 0.08, x$action[1])
# x <- harmonise_function(x, x$action[1])
return(x)
})
mr_cols <- c("beta.exposure", "beta.outcome", "se.exposure", "se.outcome")
fix.tab$mr_keep[apply(fix.tab[, mr_cols], 1, function(x) any(is.na(x)))] <- FALSE
# fix.tab <- harmonise_make_snp_effects_positive(fix.tab)
if(!"samplesize.outcome" %in% names(fix.tab))
{
fix.tab$samplesize.outcome <- NA
}
return(fix.tab)
}
harmonise_cleanup_variables <- function(res.tab)
{
res.tab$beta.exposure <- as.numeric(res.tab$beta.exposure)
res.tab$beta.outcome <- as.numeric(res.tab$beta.outcome)
res.tab$eaf.exposure <- as.numeric(res.tab$eaf.exposure)
res.tab$eaf.outcome[res.tab$eaf.outcome=="NR"] <- NA
res.tab$eaf.outcome[res.tab$eaf.outcome=="NR "] <- NA
res.tab$eaf.outcome <- as.numeric(res.tab$eaf.outcome)
# convert alleles to upper case
res.tab$effect_allele.exposure <- toupper(res.tab$effect_allele.exposure)
res.tab$other_allele.exposure <- toupper(res.tab$other_allele.exposure)
res.tab$effect_allele.outcome <- toupper(res.tab$effect_allele.outcome)
res.tab$other_allele.outcome <- toupper(res.tab$other_allele.outcome)
res.tab$other_allele.outcome[res.tab$other_allele.outcome == ""] <- NA
return(res.tab)
}
## Refactoring
check_palindromic <- function(A1, A2)
{
(A1 == "T" & A2 == "A") |
(A1 == "A" & A2 == "T") |
(A1 == "G" & A2 == "C") |
(A1 == "C" & A2 == "G")
}
flip_alleles <- function(A1)
{
A2 <- A1
A2[A1 == "A"] <- "T"
A2[A1 == "T"] <- "A"
A2[A1 == "G"] <- "C"
A2[A1 == "C"] <- "G"
return(A2)
}
harmonise_22 <- function(SNP, A1, A2, B1, B2, betaA, betaB, fA, fB, tolerance, action)
{
if(length(SNP) == 0) return(data.frame())
# Find SNPs with alleles that match in A and B
status1 <- (A1 == B1) & (A2 == B2)
to_swap <- (A1 == B2) & (A2 == B1)
# If B's alleles are the wrong way round then swap
Btemp <- B1[to_swap]
B1[to_swap] <- B2[to_swap]
B2[to_swap] <- Btemp
betaB[to_swap] <- betaB[to_swap] * -1
fB[to_swap] <- 1 - fB[to_swap]
# Check again
status1 <- (A1 == B1) & (A2 == B2)
palindromic <- check_palindromic(A1, A2)
# If NOT palindromic and alleles DON'T match then try flipping
i <- !palindromic & !status1
B1[i] <- flip_alleles(B1[i])
B2[i] <- flip_alleles(B2[i])
status1 <- (A1 == B1) & (A2 == B2)
# If still NOT palindromic and alleles DON'T match then try swapping
i <- !palindromic & !status1
to_swap <- (A1 == B2) & (A2 == B1)
Btemp <- B1[to_swap]
B1[to_swap] <- B2[to_swap]
B2[to_swap] <- Btemp
betaB[to_swap] <- betaB[to_swap] * -1
fB[to_swap] <- 1 - fB[to_swap]
# Any SNPs left with unmatching alleles need to be removed
status1 <- (A1 == B1) & (A2 == B2)
remove <- !status1
# Now deal with palindromic SNPs
# If the frequency is within tolerance then they are ambiguous and need to be flagged
minf <- 0.5 - tolerance
maxf <- 0.5 + tolerance
tempfA <- fA
tempfB <- fB
tempfA[is.na(tempfA)] <- 0.5
tempfB[is.na(tempfB)] <- 0.5
ambiguousA <- tempfA > minf & tempfA < maxf
ambiguousB <- tempfB > minf & tempfB < maxf
# If action = 2 (flip alleles based on frequency) then flip and swap
if(action == 2)
{
status2 <- ((tempfA < 0.5 & tempfB > 0.5) | (tempfA > 0.5 & tempfB < 0.5)) & palindromic
to_swap <- status2 & !remove
betaB[to_swap] <- betaB[to_swap] * -1
fB[to_swap] <- 1 - fB[to_swap]
}
d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2,
effect_allele.outcome=B1, other_allele.outcome=B2,
beta.exposure=betaA, beta.outcome=betaB,
eaf.exposure=fA, eaf.outcome=fB,
remove=remove, palindromic=palindromic, ambiguous=(ambiguousA|ambiguousB) & palindromic)
return(d)
}
# if a is not palindromic
# if a1 == b1
# if fA and fB similar
# b2 = a2
# else
# ambiguous
# else if a2 == b1
# if fA and 1-fB similar
# b2 = a1
# swap
# else
# ambiguous
# else if a1 != b1 & a2 != b1
# flip and return to top
# else a is palindromic
# remove
harmonise_21 <- function(SNP, A1, A2, B1, betaA, betaB, fA, fB, tolerance, action)
{
if(length(SNP) == 0) return(data.frame())
n <- length(A1)
B2 <- rep(NA, n)
ambiguous <- rep(FALSE, n)
palindromic <- check_palindromic(A1, A2)
remove <- palindromic
status1 <- A1 == B1
minf <- 0.5 - tolerance
maxf <- 0.5 + tolerance
tempfA <- fA
tempfB <- fB
tempfA[is.na(tempfA)] <- 0.5
tempfB[is.na(tempfB)] <- 0.5
freq_similar1 <- (tempfA < minf & tempfB < minf) | (tempfA > maxf & tempfB > maxf)
ambiguous[status1 & !freq_similar1] <- TRUE
B2[status1] <- A2[status1]
to_swap <- A2 == B1
freq_similar2 <- (tempfA < minf & tempfB > maxf) | (tempfA > maxf & tempfB < minf)
ambiguous[to_swap & !freq_similar2] <- TRUE
B2[to_swap] <- B1[to_swap]
B1[to_swap] <- A1[to_swap]
betaB[to_swap] <- betaB[to_swap] * -1
fB[to_swap] <- 1 - fB[to_swap]
to_flip <- A1 != B1 & A2 != B1
ambiguous[to_flip] <- TRUE
B1[to_flip] <- flip_alleles(B1[to_flip])
status1 <- A1 == B1
B2[status1] <- A2[status1]
to_swap <- A2 == B1
B2[to_swap] <- B1[to_swap]
B1[to_swap] <- A1[to_swap]
betaB[to_swap] <- betaB[to_swap] * -1
fB[to_swap] <- 1 - fB[to_swap]
d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, effect_allele.outcome=B1, other_allele.outcome=B2, beta.exposure=betaA, beta.outcome=betaB, eaf.exposure=fA, eaf.outcome=fB, remove=remove, palindromic=palindromic, ambiguous=ambiguous | palindromic)
return(d)
}
harmonise_12 <- function(SNP, A1, B1, B2, betaA, betaB, fA, fB, tolerance, action)
{
if(length(SNP) == 0) return(data.frame())
n <- length(A1)
A2 <- rep(NA, n)
ambiguous <- rep(FALSE, n)
palindromic <- check_palindromic(B1, B2)
remove <- palindromic
status1 <- A1 == B1
minf <- 0.5 - tolerance
maxf <- 0.5 + tolerance
tempfA <- fA
tempfB <- fB
tempfA[is.na(tempfA)] <- 0.5
tempfB[is.na(tempfB)] <- 0.5
freq_similar1 <- (tempfA < minf & tempfB < minf) | (tempfA > maxf & tempfB > maxf)
ambiguous[status1 & !freq_similar1] <- TRUE
A2[status1] <- B2[status1]
to_swap <- A1 == B2
freq_similar2 <- (tempfA < minf & tempfB > maxf) | (tempfA > maxf & tempfB < minf)
ambiguous[to_swap & !freq_similar2] <- TRUE
A2[to_swap] <- A1[to_swap]
A1[to_swap] <- B1[to_swap]
betaA[to_swap] <- betaA[to_swap] * -1
fA[to_swap] <- 1 - fA[to_swap]
to_flip <- A1 != B1 & A1 != B2
ambiguous[to_flip] <- TRUE
A1[to_flip] <- flip_alleles(A1[to_flip])
status1 <- A1 == B1
A2[status1] <- B2[status1]
to_swap <- B2 == A1
B2[to_swap] <- B1[to_swap]
B1[to_swap] <- A1[to_swap]
betaB[to_swap] <- betaB[to_swap] * -1
fB[to_swap] <- 1 - fB[to_swap]
d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, effect_allele.outcome=B1, other_allele.outcome=B2, beta.exposure=betaA, beta.outcome=betaB, eaf.exposure=fA, eaf.outcome=fB, remove=remove, palindromic=palindromic, ambiguous=ambiguous | palindromic)
return(d)
}
harmonise_11 <- function(SNP, A1, B1, betaA, betaB, fA, fB, tolerance, action)
{
if(length(SNP) == 0) return(data.frame())
n <- length(A1)
A2 <- rep(NA, n)
B2 <- rep(NA, n)
ambiguous <- rep(FALSE, n)
palindromic <- FALSE
status1 <- A1 == B1
remove <- !status1
minf <- 0.5 - tolerance
maxf <- 0.5 + tolerance
tempfA <- fA
tempfB <- fB
tempfA[is.na(tempfA)] <- 0.5
tempfB[is.na(tempfB)] <- 0.5
freq_similar1 <- (tempfA < minf & tempfB < minf) | (tempfA > maxf & tempfB > maxf)
ambiguous[status1 & !freq_similar1] <- TRUE
d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, effect_allele.outcome=B1, other_allele.outcome=B2, beta.exposure=betaA, beta.outcome=betaB, eaf.exposure=fA, eaf.outcome=fB, remove=remove, palindromic=palindromic, ambiguous=ambiguous | palindromic)
return(d)
}
harmonise_function_refactored <- function(dat, tolerance, action)
{
SNP <- dat$SNP
A1 <- dat$effect_allele.exposure
A2 <- dat$other_allele.exposure
B1 <- dat$effect_allele.outcome
B2 <- dat$other_allele.outcome
betaA <- dat$beta.exposure
betaB <- dat$beta.outcome
fA <- dat$eaf.exposure
fB <- dat$eaf.outcome
dat$abs.beta.exposure <- abs(dat$beta.exposure)
dat$abs.beta.outcome <- abs(dat$beta.outcome)
dat <- subset(dat, select=-c(effect_allele.exposure, other_allele.exposure,
effect_allele.outcome, other_allele.outcome, beta.exposure, beta.outcome, eaf.exposure, eaf.outcome))
i22 <- !is.na(A1) & !is.na(A2) & !is.na(B1) & !is.na(B2)
i21 <- !is.na(A1) & !is.na(A2) & !is.na(B1) & is.na(B2)
i12 <- !is.na(A1) & is.na(A2) & !is.na(B1) & !is.na(B2)
i11 <- !is.na(A1) & is.na(A2) & !is.na(B1) & is.na(B2)
d22 <- harmonise_22(SNP[i22], A1[i22], A2[i22], B1[i22], B2[i22], betaA[i22], betaB[i22], fA[i22], fB[i22], tolerance, action)
d21 <- harmonise_21(SNP[i21], A1[i21], A2[i21], B1[i21], betaA[i21], betaB[i21], fA[i21], fB[i21], tolerance, action)
d12 <- harmonise_12(SNP[i12], A1[i12], B1[i12], B2[i12], betaA[i12], betaB[i12], fA[i12], fB[i12], tolerance, action)
d11 <- harmonise_11(SNP[i11], A1[i11], B1[i11], betaA[i11], betaB[i11], fA[i11], fB[i11], tolerance, action)
d <- rbind(d21, d22, d12, d11)
d$abs.beta.exposure <- abs(d$beta.exposure)
d$abs.beta.outcome <- abs(d$beta.outcome)
dd <- merge(d, dat, by=c("SNP",
"abs.beta.exposure",
"abs.beta.outcome"), all.x=T)
if (nrow(dd) > nrow(d)) {
tmp1 <- table(dd$SNP)
tmp2 <- table(d$SNP)
ii <- names(which(tmp1 > tmp2))
d <- dd[-which(dd$SNP == ii), , drop = F]
} else
d <- dd
rm(dd)
d <- subset(d, select = -c(abs.beta.outcome, abs.beta.exposure))
d <- d[order(d$id.outcome), ]
d$mr_keep <- TRUE
if(action == 3)
{
# d1 <- subset(d, !palindromic & !remove & !ambiguous)
d$mr_keep[d$palindromic | d$remove | d$ambiguous] <- FALSE
if(any(d$palindromic))
{
message("Removing the following SNPs for being palindromic:\n", paste(d$SNP[d$palindromic], collapse=", "))
}
if(any(d$remove))
{
message("Removing the following SNPs for incompatible alleles:\n", paste(d$SNP[d$remove], collapse=", "))
}
if(any(d$ambiguous & !d$palindromic))
{
message("Removing the following SNPs for having incompatible allele frequencies:\n", paste(d$SNP[d$ambiguous], collapse=", "))
}
}
if(action == 2)
{
# d1 <- subset(d, !remove & !ambiguous)
d$mr_keep[d$remove | d$ambiguous] <- FALSE
if(any(d$remove))
{
message("Removing the following SNPs for incompatible alleles:\n", paste(d$SNP[d$remove], collapse=", "))
}
if(any(d$ambiguous))
{
message("Removing the following SNPs for being palindromic with intermediate allele frequencies:\n", paste(d$SNP[d$ambiguous], collapse=", "))
}
}
if(action == 1)
{
# d1 <- subset(d, !remove)
d$mr_keep[d$remove] <- FALSE
if(any(d$remove))
{
message("Removing the following SNPs for incompatible alleles:\n", paste(d$SNP[d$remove], collapse=", "))
}
}
# if(nrow(d1) != nrow(d))
# {
# message("Removing the following SNPs due to harmonising issues:\n",
# paste(subset(d, ! SNP %in% d1$SNP)$SNP, collapse="\n")
# )
# }
return(d)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.