#' Ensure A1 & A2 are correctly named, if GWAS SNP constructed as
#' Alternative/Reference or Risk/Nonrisk alleles these SNPs will need to be
#' converted to Reference/Alternative or Nonrisk/Risk. Here non-risk is defined
#' as what's on the reference genome (this may not always be the case).
#'
#' @inheritParams format_sumstats
#' @param log_files list of log file locations
#' @param standardise_headers Run
#' \code{standardise_sumstats_column_headers_crossplatform} first.
#'
#' @return A list containing two data tables:
#' \itemize{
#' \item \code{sumstats_dt}: the modified summary statistics
#' \code{data.table} object.
#' \item \code{rsids}: snpsById, filtered to SNPs of interest if
#' loaded already. Or else NULL.
#' \item \code{log_files}: log file list
#' }
#' @keywords internal
#' @import R.utils
#' @importFrom data.table setkey
#' @importFrom data.table :=
#' @importFrom data.table setnames
#' @importFrom data.table set
#' @importFrom data.table setorder
#' @importFrom data.table copy
check_allele_flip <- function(sumstats_dt, path,
ref_genome, rsids,
allele_flip_check,
allele_flip_drop,
allele_flip_z,
allele_flip_frq,
bi_allelic_filter,
flip_frq_as_biallelic,
imputation_ind,
log_folder_ind,
check_save_out,
tabix_index,
nThread,
log_files,
standardise_headers = FALSE,
mapping_file,
dbSNP) {
# # GenomicSEM' allele flipping strategy:
# file.path("https://github.com/GenomicSEM/GenomicSEM/blob",
# "fc8f17a817a8022d6900acf41824d27b3676f9c4/R/munge.R#L151")
# #example
# path <- system.file("extdata","eduAttainOkbay.txt",
# package="MungeSumstats")
# sumstats_dt <- read_sumstats(path = path)
# sumstats_return <- check_allele_flip(sumstats_dt = sumstats_dt,
# path=path,
# ref_genome="GRCh37",
# rsids=NULL,
# allele_flip_check=TRUE,
# standardise_headers=TRUE)
## Set variables to be used in in place data.table functions to NULL
## to avoid confusing BiocCheck.
SNP <- i.seqnames <- CHR <- BP <- i.pos <- LP <- P <- A1 <- A2 <- eff_i <-
i.A1 <- i.A2 <- ss_A1 <- ss_A2 <- i.ref_allele <-
ref_gen_allele <- match_type <-
tmp <- flipped <- NULL
if (standardise_headers) {
sumstats_dt <-
standardise_sumstats_column_headers_crossplatform(
sumstats_dt = sumstats_dt,
mapping_file = mapping_file
)[["sumstats_dt"]]
}
# If SNP present but no A1/A2 then need to find them
col_headers <- names(sumstats_dt)
if (sum(c("A1", "A2") %in% col_headers) == 2 && allele_flip_check) {
message(
"Checking for correct direction of A1 (reference) ",
"and A2 (alternative allele)."
)
# check if rsids loaded if not do so
if (is.null(rsids)) {
rsids <-
load_ref_genome_data(
data.table::copy(sumstats_dt$SNP),
ref_genome = ref_genome,
dbSNP = dbSNP, msg=NULL
)
}
# ensure rsids is up-to-date with filtered sumstats_dt
rsids <- rsids[unique(sumstats_dt$SNP), , nomatch = NULL]
data.table::setkey(rsids, SNP)
data.table::setkey(sumstats_dt, SNP)
# Append reference genome to data and check matches to both A1 or A2
# For each SNP, if ref genome matches A1, leave it (TRUE)
# if ref genome matches A2, flip it (FALSE)
# if ref genome doesn't match A1 or A2, leave it (TRUE)
sumstats_dt[rsids, ref_gen_allele := i.ref_allele]
sumstats_dt[is.na(ref_gen_allele), match_type := TRUE]
sumstats_dt[A1 == ref_gen_allele, match_type := TRUE]
sumstats_dt[A2 == ref_gen_allele, match_type := FALSE]
print(sumstats_dt)
# drop cases that don't match either
if (allele_flip_drop &&
nrow(sumstats_dt[A1 != ref_gen_allele &
A2 != ref_gen_allele, ]) > 0) {
print_msg0 <-
paste0(
"There are ",
formatC(nrow(sumstats_dt[A1 != ref_gen_allele &
A2 != ref_gen_allele, ]), big.mark = ","),
" SNPs where neither A1 nor A2 match the reference genome.",
" These will be removed."
)
message(print_msg0)
# If user wants log, save it to there
if (log_folder_ind) {
name <- "alleles_dont_match_ref_gen"
name <- get_unique_name_log_file(
name = name,
log_files = log_files
)
write_sumstats(
sumstats_dt = sumstats_dt[(A1 != ref_gen_allele &
A2 != ref_gen_allele), ],
save_path =
paste0(
check_save_out$log_folder,
"/", name,
check_save_out$extension
),
sep = check_save_out$sep,
#don't tab indx as could be miss values & cause err
#tabix_index = tabix_index,
nThread = nThread
)
log_files[[name]] <-
paste0(
check_save_out$log_folder, "/", name,
check_save_out$extension
)
}
sumstats_dt <- sumstats_dt[!(A1 != ref_gen_allele &
A2 != ref_gen_allele), ]
} else {
sumstats_dt[
A1 != ref_gen_allele & A2 != ref_gen_allele,
match_type := TRUE
]
}
# continue if flipping necessary
if (nrow(sumstats_dt[match_type == FALSE, ]) > 0) {
print_msg <- paste0(
"There are ",
formatC(nrow(sumstats_dt[match_type == FALSE, ]),
big.mark = ","),
" SNPs where A1 doesn't match the reference genome.",
"\nThese will be flipped with their effect columns."
)
message(print_msg)
# swap A1 and A2 for those SNPs needing to be flipped
sumstats_dt[match_type == FALSE, tmp := A2]
sumstats_dt[match_type == FALSE, A2 := A1]
sumstats_dt[match_type == FALSE, A1 := tmp]
sumstats_dt[, tmp := NULL]
# flip effect column(s) - BETA, OR, Z, log_odds, SIGNED_SUMSTAT, FRQ
effect_columns <- c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT")
if (allele_flip_z) {
effect_columns <- c(effect_columns, "Z")
}
if (allele_flip_frq) {
effect_columns <- c(effect_columns, "FRQ")
}
effect_columns <-
effect_columns[effect_columns %in% names(sumstats_dt)]
# if FRQ present and needs to be flipped for SNPs ensure
# bi_allelic_filter is TRUE
stp_msg <- paste0(
"Certain SNPs need to be flipped along with their ",
"effect columns and frequency column. However to flip ",
"the\nFRQ column, only bi-allelic SNPs can be ",
"considered. It is recommended to set ",
"bi_allelic_filter to TRUE so\nnon-bi-allelic SNPs are",
" removed. Otherwise, set allele_flip_frq to FALSE to ",
"not flip the FRQ column but note\nthis could lead to ",
"incorrect FRQ values.\nA new option added is to flip ",
"non-bi-allelic SNPs as if they were bi-allelic (1-FRQ), ",
"to do this set\nflip_frq_as_biallelic to TRUE but note ",
"these values will not be exact down to the missing ",
"frequencies of the other\nalternative allele(s)."
)
if (nrow(sumstats_dt[match_type == FALSE, ]) > 0 &&
"FRQ" %in% effect_columns &&
!bi_allelic_filter && !flip_frq_as_biallelic) {
stop(stp_msg)
}
#if set flip_frq_as_biallelic = TRUE, let them know
if (nrow(sumstats_dt[match_type == FALSE, ]) > 0 &&
"FRQ" %in% effect_columns &&
!bi_allelic_filter && flip_frq_as_biallelic) {
print_msg <- paste0(
"Note: You have set flip_frq_as_biallelic to TRUE meaning of ",
"the ",
formatC(nrow(sumstats_dt[match_type == FALSE, ]),
big.mark = ","),
" SNPs to be flipped, ",
"any non-bi-allelic SNPs\nwill have their frequencies flipped ",
"as (1-FRQ) essentially ignoring the frequencies of any other ",
"alternative alleles."
)
message(print_msg)
}
for (eff_i in effect_columns) { # set updates quicker for DT
# conversion done in case, VCF beta column may not be numeric
if (eff_i == "FRQ") {
sumstats_dt[
match_type == FALSE,
(eff_i) := (1 - as.numeric(get(eff_i)))
]
} else if(eff_i == "OR"){
sumstats_dt[
match_type == FALSE,
(eff_i) := (1/as.numeric(get(eff_i)))
]
}
else {
sumstats_dt[
match_type == FALSE,
(eff_i) := as.numeric(get(eff_i)) * -1
]
}
}
if (imputation_ind) {
sumstats_dt[match_type == FALSE, flipped := TRUE]
}
}
# remove extra created columns and return
sumstats_dt[, ref_gen_allele := NULL]
sumstats_dt[, match_type := NULL]
return(list(
"sumstats_dt" = sumstats_dt,
"rsids" = rsids, "log_files" = log_files
))
}
return(list(
"sumstats_dt" = sumstats_dt,
"rsids" = rsids, "log_files" = log_files
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.