GRAPPLE-beta-: R Package for MR Framework GRAPPLE

Documented in harmonise_data

#' Harmonise the alleles and effects between the exposure and outcome
#'
#' @description
#' In order to perform MR the effect of a SNP on an outcome and exposure must be harmonised to be
#' relative to the same allele. 
#'
#' @details
#' Expects data in the format generated by \code{read_exposure_data} and \code{extract_outcome_data}. 
#' This means the inputs must be dataframes with the following columns:
#' \code{outcome_dat}:
#' - SNP
#' - beta.outcome
#' - se.outcome
#' - effect_allele.outcome
#' - other_allele.outcome
#' - eaf.outcome
#' - outcome
#'
#' \code{exposure_dat}:
#' - SNP
#' - beta.exposure
#' - se.exposure
#' - effect_allele.exposure
#' - other_allele.exposure
#' - eaf.exposure
#'
#' @param exposure_dat Output from \code{read_exposure_data}
#' @param outcome_dat Output from \code{extract_outcome_data}
#' @param action Level of strictness in dealing with SNPs. 1=Assume all reference alleles are on the positive strand, i.e. do nothing (warning - this is very risky and is not recommended); 2=Try to infer positive strand alleles, using allele frequencies for palindromes; 3=Correct strand for non-palindromic SNPs, and drop all palindromic SNPs from the analysis. If a single value is passed then this action is applied to all outcomes. But multiple values can be supplied as a vector, each element relating to a different outcome.
#'
#' @keywords internal
#' @return Data frame with harmonised effects and alleles
harmonise_data <- function(exposure_dat, outcome_dat, action=2)
{
	stopifnot(all(action %in% 1:3))

	res.tab <- merge(outcome_dat, exposure_dat, by="SNP")
	ncombinations <- length(unique(res.tab$id.outcome))
	if(length(action) == 1)
	{
		action <- rep(action, ncombinations)
	} else if(length(action) != ncombinations) {
		stop("Action argument must be of length 1 (where the same action will be used for all outcomes), or number of unique id.outcome values (where each outcome will use a different action value)")
	}

	res.tab <- harmonise_cleanup_variables(res.tab)

	d <- data.frame(id.outcome=unique(res.tab$id.outcome), action=action)
	res.tab <- merge(res.tab, d, by="id.outcome")

	fix.tab <- plyr::ddply(res.tab, c("id.exposure", "id.outcome"), function(x)
	{
		x <- plyr::mutate(x)
		message("Harmonising ", x$exposure[1], " (", x$id.exposure[1], ") and ", x$outcome[1], " (", x$id.outcome[1], ")")

# SNP, A1, A2, B1, B2, betaA, betaB, fA, fB, tolerance, action		
		x <- harmonise_function_refactored(x, 0.08, x$action[1])
		# x <- harmonise_function(x, x$action[1])
		return(x)
	})
	mr_cols <- c("beta.exposure", "beta.outcome", "se.exposure", "se.outcome")
	fix.tab$mr_keep[apply(fix.tab[, mr_cols], 1, function(x) any(is.na(x)))] <- FALSE
	# fix.tab <- harmonise_make_snp_effects_positive(fix.tab)
	if(!"samplesize.outcome" %in% names(fix.tab))
	{
		fix.tab$samplesize.outcome <- NA
	}
	return(fix.tab)
}





harmonise_cleanup_variables <- function(res.tab)
{
	res.tab$beta.exposure <- as.numeric(res.tab$beta.exposure)
	res.tab$beta.outcome <- as.numeric(res.tab$beta.outcome)
	res.tab$eaf.exposure <- as.numeric(res.tab$eaf.exposure)
	res.tab$eaf.outcome[res.tab$eaf.outcome=="NR"] <- NA
	res.tab$eaf.outcome[res.tab$eaf.outcome=="NR "] <- NA
	res.tab$eaf.outcome <- as.numeric(res.tab$eaf.outcome)

	# convert alleles to upper case
	res.tab$effect_allele.exposure <- toupper(res.tab$effect_allele.exposure)
	res.tab$other_allele.exposure <- toupper(res.tab$other_allele.exposure)
	res.tab$effect_allele.outcome <- toupper(res.tab$effect_allele.outcome)
	res.tab$other_allele.outcome <- toupper(res.tab$other_allele.outcome)
	res.tab$other_allele.outcome[res.tab$other_allele.outcome == ""] <- NA
	
	return(res.tab)	
}


## Refactoring

check_palindromic <- function(A1, A2)
{
	(A1 == "T" & A2 == "A") |
	(A1 == "A" & A2 == "T") |
	(A1 == "G" & A2 == "C") |
	(A1 == "C" & A2 == "G")
}


flip_alleles <- function(A1)
{
	A2 <- A1
	A2[A1 == "A"] <- "T"
	A2[A1 == "T"] <- "A"
	A2[A1 == "G"] <- "C"
	A2[A1 == "C"] <- "G"
	return(A2)
}


harmonise_22 <- function(SNP, A1, A2, B1, B2, betaA, betaB, fA, fB, tolerance, action)
{
	if(length(SNP) == 0) return(data.frame())

	# Find SNPs with alleles that match in A and B
	status1 <- (A1 == B1) & (A2 == B2)
	to_swap <- (A1 == B2) & (A2 == B1)

	# If B's alleles are the wrong way round then swap
	Btemp <- B1[to_swap]
	B1[to_swap] <- B2[to_swap]
	B2[to_swap] <- Btemp
	betaB[to_swap] <- betaB[to_swap] * -1
	fB[to_swap] <- 1 - fB[to_swap]

	# Check again
	status1 <- (A1 == B1) & (A2 == B2)
	palindromic <- check_palindromic(A1, A2)

	# If NOT palindromic and alleles DON'T match then try flipping
	i <- !palindromic & !status1
	B1[i] <- flip_alleles(B1[i])
	B2[i] <- flip_alleles(B2[i])
	status1 <- (A1 == B1) & (A2 == B2)

	# If still NOT palindromic and alleles DON'T match then try swapping
	i <- !palindromic & !status1
	to_swap <- (A1 == B2) & (A2 == B1)
	Btemp <- B1[to_swap]
	B1[to_swap] <- B2[to_swap]
	B2[to_swap] <- Btemp
	betaB[to_swap] <- betaB[to_swap] * -1
	fB[to_swap] <- 1 - fB[to_swap]

	# Any SNPs left with unmatching alleles need to be removed
	status1 <- (A1 == B1) & (A2 == B2)
	remove <- !status1

	# Now deal with palindromic SNPs
	# If the frequency is within tolerance then they are ambiguous and need to be flagged
	minf <- 0.5 - tolerance
	maxf <- 0.5 + tolerance
	tempfA <- fA
	tempfB <- fB
	tempfA[is.na(tempfA)] <- 0.5
	tempfB[is.na(tempfB)] <- 0.5
	ambiguousA <- tempfA > minf & tempfA < maxf
	ambiguousB <- tempfB > minf & tempfB < maxf

	# If action = 2 (flip alleles based on frequency) then flip and swap
	if(action == 2)
	{
		status2 <- ((tempfA < 0.5 & tempfB > 0.5) | (tempfA > 0.5 & tempfB < 0.5)) & palindromic
		to_swap <- status2 & !remove
		betaB[to_swap] <- betaB[to_swap] * -1
		fB[to_swap] <- 1 - fB[to_swap]
	}

	d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, 
					effect_allele.outcome=B1, other_allele.outcome=B2, 
					beta.exposure=betaA, beta.outcome=betaB, 
					eaf.exposure=fA, eaf.outcome=fB, 
					remove=remove, palindromic=palindromic, ambiguous=(ambiguousA|ambiguousB) & palindromic)
	return(d)
}



# if a is not palindromic
# 	if a1 == b1
# 		if fA and fB similar
# 			b2 = a2
# 		else
# 			ambiguous

# 	else if a2 == b1
# 		if fA and 1-fB similar
# 			b2 = a1
# 			swap
# 		else
# 			ambiguous

# 	else if a1 != b1 & a2 != b1
# 		flip and return to top

# else a is palindromic
# 	remove


harmonise_21 <- function(SNP, A1, A2, B1, betaA, betaB, fA, fB, tolerance, action)
{
	if(length(SNP) == 0) return(data.frame())

	n <- length(A1)
	B2 <- rep(NA, n)
	ambiguous <- rep(FALSE, n)
	palindromic <- check_palindromic(A1, A2)
	remove <- palindromic

	status1 <- A1 == B1
	minf <- 0.5 - tolerance
	maxf <- 0.5 + tolerance

	tempfA <- fA
	tempfB <- fB
	tempfA[is.na(tempfA)] <- 0.5
	tempfB[is.na(tempfB)] <- 0.5

	freq_similar1 <- (tempfA < minf & tempfB < minf) | (tempfA > maxf & tempfB > maxf)
	ambiguous[status1 & !freq_similar1] <- TRUE

	B2[status1] <- A2[status1]

	to_swap <- A2 == B1
	freq_similar2 <- (tempfA < minf & tempfB > maxf) | (tempfA > maxf & tempfB < minf)

	ambiguous[to_swap & !freq_similar2] <- TRUE
	B2[to_swap] <- B1[to_swap]
	B1[to_swap] <- A1[to_swap]
	betaB[to_swap] <- betaB[to_swap] * -1
	fB[to_swap] <- 1 - fB[to_swap]

	to_flip <- A1 != B1 & A2 != B1

	ambiguous[to_flip] <- TRUE

	B1[to_flip] <- flip_alleles(B1[to_flip])
	status1 <- A1 == B1
	B2[status1] <- A2[status1]

	to_swap <- A2 == B1
	B2[to_swap] <- B1[to_swap]
	B1[to_swap] <- A1[to_swap]
	betaB[to_swap] <- betaB[to_swap] * -1
	fB[to_swap] <- 1 - fB[to_swap]


	d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, effect_allele.outcome=B1, other_allele.outcome=B2, beta.exposure=betaA, beta.outcome=betaB, eaf.exposure=fA, eaf.outcome=fB, remove=remove, palindromic=palindromic, ambiguous=ambiguous | palindromic)

	return(d)

}

harmonise_12 <- function(SNP, A1, B1, B2, betaA, betaB, fA, fB, tolerance, action)
{
	if(length(SNP) == 0) return(data.frame())

	n <- length(A1)
	A2 <- rep(NA, n)
	ambiguous <- rep(FALSE, n)
	palindromic <- check_palindromic(B1, B2)
	remove <- palindromic

	status1 <- A1 == B1
	minf <- 0.5 - tolerance
	maxf <- 0.5 + tolerance

	tempfA <- fA
	tempfB <- fB
	tempfA[is.na(tempfA)] <- 0.5
	tempfB[is.na(tempfB)] <- 0.5

	freq_similar1 <- (tempfA < minf & tempfB < minf) | (tempfA > maxf & tempfB > maxf)
	ambiguous[status1 & !freq_similar1] <- TRUE

	A2[status1] <- B2[status1]

	to_swap <- A1 == B2
	freq_similar2 <- (tempfA < minf & tempfB > maxf) | (tempfA > maxf & tempfB < minf)

	ambiguous[to_swap & !freq_similar2] <- TRUE
	A2[to_swap] <- A1[to_swap]
	A1[to_swap] <- B1[to_swap]
	betaA[to_swap] <- betaA[to_swap] * -1
	fA[to_swap] <- 1 - fA[to_swap]

	to_flip <- A1 != B1 & A1 != B2

	ambiguous[to_flip] <- TRUE

	A1[to_flip] <- flip_alleles(A1[to_flip])
	status1 <- A1 == B1
	A2[status1] <- B2[status1]

	to_swap <- B2 == A1
	B2[to_swap] <- B1[to_swap]
	B1[to_swap] <- A1[to_swap]
	betaB[to_swap] <- betaB[to_swap] * -1
	fB[to_swap] <- 1 - fB[to_swap]


	d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, effect_allele.outcome=B1, other_allele.outcome=B2, beta.exposure=betaA, beta.outcome=betaB, eaf.exposure=fA, eaf.outcome=fB, remove=remove, palindromic=palindromic, ambiguous=ambiguous | palindromic)

	return(d)

}


harmonise_11 <- function(SNP, A1, B1, betaA, betaB, fA, fB, tolerance, action)
{
	if(length(SNP) == 0) return(data.frame())

	n <- length(A1)
	A2 <- rep(NA, n)
	B2 <- rep(NA, n)
	ambiguous <- rep(FALSE, n)
	palindromic <- FALSE

	status1 <- A1 == B1
	remove <- !status1

	minf <- 0.5 - tolerance
	maxf <- 0.5 + tolerance

	tempfA <- fA
	tempfB <- fB
	tempfA[is.na(tempfA)] <- 0.5
	tempfB[is.na(tempfB)] <- 0.5

	freq_similar1 <- (tempfA < minf & tempfB < minf) | (tempfA > maxf & tempfB > maxf)
	ambiguous[status1 & !freq_similar1] <- TRUE

	d <- data.frame(SNP=SNP, effect_allele.exposure=A1, other_allele.exposure=A2, effect_allele.outcome=B1, other_allele.outcome=B2, beta.exposure=betaA, beta.outcome=betaB, eaf.exposure=fA, eaf.outcome=fB, remove=remove, palindromic=palindromic, ambiguous=ambiguous | palindromic)

	return(d)

}


harmonise_function_refactored <- function(dat, tolerance, action)
{
	SNP <- dat$SNP
	A1 <- dat$effect_allele.exposure
	A2 <- dat$other_allele.exposure
	B1 <- dat$effect_allele.outcome
	B2 <- dat$other_allele.outcome
	betaA <- dat$beta.exposure
	betaB <- dat$beta.outcome
	fA <- dat$eaf.exposure
	fB <- dat$eaf.outcome
  dat$abs.beta.exposure <- abs(dat$beta.exposure)
  dat$abs.beta.outcome <- abs(dat$beta.outcome)
	dat <- subset(dat, select=-c(effect_allele.exposure, other_allele.exposure, 
								 effect_allele.outcome, other_allele.outcome, beta.exposure, beta.outcome, eaf.exposure, eaf.outcome))
	
	i22 <- !is.na(A1) & !is.na(A2) & !is.na(B1) & !is.na(B2)
	i21 <- !is.na(A1) & !is.na(A2) & !is.na(B1) & is.na(B2)
	i12 <- !is.na(A1) & is.na(A2) & !is.na(B1) & !is.na(B2)
	i11 <- !is.na(A1) & is.na(A2) & !is.na(B1) & is.na(B2)

	d22 <- harmonise_22(SNP[i22], A1[i22], A2[i22], B1[i22], B2[i22], betaA[i22], betaB[i22], fA[i22], fB[i22], tolerance, action)
	d21 <- harmonise_21(SNP[i21], A1[i21], A2[i21], B1[i21], betaA[i21], betaB[i21], fA[i21], fB[i21], tolerance, action)
	d12 <- harmonise_12(SNP[i12], A1[i12], B1[i12], B2[i12], betaA[i12], betaB[i12], fA[i12], fB[i12], tolerance, action)
	d11 <- harmonise_11(SNP[i11], A1[i11], B1[i11], betaA[i11], betaB[i11], fA[i11], fB[i11], tolerance, action)

	d <- rbind(d21, d22, d12, d11)
  d$abs.beta.exposure <- abs(d$beta.exposure)  
  d$abs.beta.outcome <- abs(d$beta.outcome)

	dd <- merge(d, dat, by=c("SNP", 
                          "abs.beta.exposure", 
                          "abs.beta.outcome"), all.x=T)
	if (nrow(dd) > nrow(d)) {
		tmp1 <- table(dd$SNP)
		tmp2 <- table(d$SNP)
		ii <- names(which(tmp1 > tmp2))
		d <- dd[-which(dd$SNP == ii), , drop = F]
	} else
		d <- dd
	rm(dd)
  	d <- subset(d, select = -c(abs.beta.outcome, abs.beta.exposure))
	d <- d[order(d$id.outcome), ]
	d$mr_keep <- TRUE

	if(action == 3)
	{
		# d1 <- subset(d, !palindromic & !remove & !ambiguous)
		d$mr_keep[d$palindromic | d$remove | d$ambiguous] <- FALSE
		if(any(d$palindromic))
		{
			message("Removing the following SNPs for being palindromic:\n", paste(d$SNP[d$palindromic], collapse=", "))
		}
		if(any(d$remove))
		{
			message("Removing the following SNPs for incompatible alleles:\n", paste(d$SNP[d$remove], collapse=", "))
		}
		if(any(d$ambiguous & !d$palindromic))
		{
			message("Removing the following SNPs for having incompatible allele frequencies:\n", paste(d$SNP[d$ambiguous], collapse=", "))
		}
	}
	if(action == 2)
	{
		# d1 <- subset(d, !remove & !ambiguous)
		d$mr_keep[d$remove | d$ambiguous] <- FALSE
		if(any(d$remove))
		{
			message("Removing the following SNPs for incompatible alleles:\n", paste(d$SNP[d$remove], collapse=", "))
		}
		if(any(d$ambiguous))
		{
			message("Removing the following SNPs for being palindromic with intermediate allele frequencies:\n", paste(d$SNP[d$ambiguous], collapse=", "))
		}
	}
	if(action == 1)
	{
		# d1 <- subset(d, !remove)
		d$mr_keep[d$remove] <- FALSE
		if(any(d$remove))
		{
			message("Removing the following SNPs for incompatible alleles:\n", paste(d$SNP[d$remove], collapse=", "))
		}
	}

	# if(nrow(d1) != nrow(d))
	# {
	# 	message("Removing the following SNPs due to harmonising issues:\n",
	# 		paste(subset(d, ! SNP %in% d1$SNP)$SNP, collapse="\n")
	# 	)
	# }


	return(d)
}