#' @title Format GWAS summary data.
#' @description Reads in GWAS summary data. Infer Zscores from p-values and signed satatistics.
#' This function is adapted from the format_data() function in MRCIEU/TwoSampleMR.
#'
#' @md
#' @param dat Data frame. Must have header with at least SNP A1 A2 signed statistics pvalue and sample size.
#' @param snps.merge Data frame with SNPs to extract. must have headers: SNP A1 and A2. For example, the hapmap3 SNPlist.
#' @param snps.remove a set of SNPs needed to be removed. For example, the SNPs in MHC region.
#' @param snp_col column with SNP rs IDs. The default is `"SNP"`.
#' @param b_col Name of column with effect sizes. The default is `"b"`.
#' @param or_col: Name of column with odds ratio. The default is `or`.
#' @param se_col Name of column with standard errors. The default is `"se"`.
#' @param freq_col Name of column with effect allele frequency. The default is `"freq"`.
#' @param A1_col Name of column with effect allele. Must contain only the characters "A", "C", "T" or "G". The default is `"A1"`.
#' @param A2_col Name of column with non effect allele. Must contain only the characters "A", "C", "T" or "G". The default is `"A2"`.
#' @param p_col Name of column with p-value. The default is `"pval"`.
#' @param ncase_col Name of column with number of cases. The default is `"ncase"`.
#' @param ncontrol_col Name of column with number of controls. The default is `"ncontrol"`.
#' @param n_col Name of column with sample size. The default is `"n"`.
#' @param z_col Name of column with Zscore. The default is `"z"`.
#' @param info_col Name of column with inputation Info. The default is `"info_col"`.
#' @param log_pval The pval is -log10(p_col). The default is `FALSE`.
#' @param min_freq SNPs with allele frequecy less than min_freq will be removed.The default is `0.01`
#' @param n Sample size
#' @param n_qc Whether to remove SNPs according to the sample size of SNPs. The default is `FALSE`.
#'
#' @export
#' @return data frame wih headers: SNP: rsid; A1: effect allele; A2: non effect allel;
#' Z: Z score; N: sample size; chi2: chi square statistics; P: p-value.
#' @importFrom stats pnorm
format_data <- function(dat,
snps.merge = w_hm3.snplist,
snps.remove = MHC.SNPs,
snp_col="SNP",
b_col="b",
or_col ="or",
se_col="se",
freq_col="freq",
A1_col="effect_allele",
A2_col="other_allele",
p_col="pval",
ncase_col="ncase",
ncontrol_col="ncontrol",
n_col="N",
n=NULL,
z_col="z",
info_col="info",
log_pval=FALSE,
n_qc=F,
min_freq=0.01)
{
message("Begin formatting .... ")
message("The raw dataset has ", nrow(dat), " dat lines.")
cols = c(snp_col, b_col, or_col, se_col, freq_col, A1_col, A2_col, p_col, ncase_col, ncontrol_col, n_col,z_col,info_col)
dat = dat[, names(dat) %in% cols]
if(! snp_col %in% names(dat)){
stop("SNP column not found")
}
if(! A1_col %in% names(dat) | (!A2_col %in% names(dat))){
stop("A1/A2 columns not found")
}
if((!b_col %in% names(dat)) & (!or_col %in% names(dat)) & (!z_col %in% names(dat))){
stop("signed statistics not found")
}
if((!n_col %in% names(dat)) & (!(ncase_col %in% names(dat) & ncontrol_col %in% names(dat))) & is.null(n)){
stop("Information for sample size not found")
}
names(dat)[which(names(dat) == snp_col)[1]] <- "SNP"
dat$SNP <- tolower(dat$SNP)
dat$SNP <- gsub("[[:space:]]", "", dat$SNP)
dat <- subset(dat, !is.na(SNP))
if(!is.null(snps.remove)){
message("Remove SNPs in a given SNP set(default: SNPs in MHC region)...")
dat <- subset(dat, !SNP %in% snps.remove)
}
if(!is.null(snps.merge)){
snps.merge = snps.merge[, c("SNP","A1","A2")]
colnames(snps.merge) = c("SNP","ref.A1", "ref.A2")
message("Merge SNPs with a given snplist(default: hapmap3 snplist)...")
dat <- merge(dat, snps.merge, by="SNP")
}
if(info_col %in% names(dat)){
names(dat)[which(names(dat) == info_col)[1]] <- "info"
message("Remove SNPs with imputation info less than 0.9 ...")
dat <- subset(dat, info > 0.9)
}
# Check effect_allele (A1)
if(A1_col %in% names(dat)){
names(dat)[which(names(dat) == A1_col)[1]] <- "A1"
if(is.logical(dat$A1)){
dat$A1 <- substr(as.character(dat$A1), 1, 1)
}
if(!is.character(dat$A1)){
message("effect_allele column is not character data. Coercing...")
dat$A1 <- as.character(dat$A1)
}
dat$A1 <- toupper(dat$A1)
index = !grepl("^[ACTG]+$", dat$A1)
if(any(index)){
message("effect_allele column has some values that are not A/C/T/G. Remove these SNPs...")
dat = dat[-index,]
}
}
# Check effect_allele (A2)
if(A2_col %in% names(dat)){
names(dat)[which(names(dat) == A2_col)[1]] <- "A2"
if(is.logical(dat$A2))
{
dat$A2 <- substr(as.character(dat$A2), 1, 1)
}
if(!is.character(dat$A2))
{
message("other_allele column is not character data. Coercing...")
dat$A2 <- as.character(dat$A2)
}
dat$A2 <- toupper(dat$A2)
index = !grepl("^[ACTG]+$", dat$A2)
if(any(index)){
message("other allele column has some values that are not A/C/T/G. Remove these SNPs...")
dat = dat[-index,]
}
}
index = which((dat$A1=="A" & dat$A2=="T") |
(dat$A1=="T" & dat$A2=="A") |
(dat$A1=="C" & dat$A2=="G") |
(dat$A1=="G" & dat$A2=="C") |
(dat$A1=="A" & dat$A2=="A") |
(dat$A1=="T" & dat$A2=="T") |
(dat$A1=="C" & dat$A2=="C") |
(dat$A1=="G" & dat$A2=="G"))
if(any(index)){
message("Remove ambiguous SNPs ...")
dat = dat[-index,]
rm(index)
}
comple <- function(allele){
ifelse(allele == "A","T", ifelse(allele == "T","A", ifelse(allele == "G","C", ifelse(allele == "C","G", allele)) ))
}
index = which(!((dat$A1 == dat$ref.A2 & dat$A2 == dat$ref.A1) |
(dat$A1 == dat$ref.A1 & dat$A2 == dat$ref.A2) |
(dat$A1 == comple(dat$ref.A2) & dat$A2 == comple(dat$ref.A1))|
(dat$A1 == comple(dat$ref.A1) & dat$A2 == comple(dat$ref.A2))))
if(any(index)){
message("Remove SNPs with alleles not matched with the hapmap3 snplist")
dat = dat[-index,]
rm(index)
}
# message("Remove Duplicated SNPs if have. Just keeping the first instance")
# dat = dat[unique(dat$SNP),] # can be faster
# Check effect size estimate (b)
# set b as log(or) of b is not available
if(! b_col %in% names(dat) & or_col %in% names(dat)){
message("infer b column from log(or)...")
dat$b = log(dat$or_col)
}
if(b_col %in% names(dat)){
names(dat)[which(names(dat) == b_col)[1]] <- "b"
if(!is.numeric(dat$b))
{
message("b column is not numeric. Coercing...")
dat$b <- as.numeric(dat$b)
}
dat = dat[is.finite(dat$b),]
}
# Check se
if(se_col %in% names(dat)){
names(dat)[which(names(dat) == se_col)[1]] <- "se"
if(!is.numeric(dat$se))
{
message("se column is not numeric. Coercing...")
dat$se <- as.numeric(dat$se)
}
dat = dat[is.finite(dat$se) & dat$se > 0,]
}
if(z_col %in% names(dat)){
names(dat)[which(which(names(dat) == z_col))[1]] <- "z"
}
# Check freq
if(freq_col %in% names(dat)){
names(dat)[which(names(dat) == freq_col)[1]] <- "freq"
if(!is.numeric(dat$freq))
{
message("freq column is not numeric. Coercing...")
dat$freq <- as.numeric(dat$freq)
}
# remove SNP with allele freqcy less than min_freq
dat = dat[dat$freq > min_freq & dat$freq < 1-min_freq, ]
}
if(ncase_col %in% names(dat)){
names(dat)[which(names(dat) == ncase_col)[1]] <- "ncase"
if(!is.numeric(dat$ncase))
{
message(ncase_col, " column is not numeric")
dat$ncase <- as.numeric(dat$ncase)
}
}
if(ncontrol_col %in% names(dat)){
names(dat)[which(names(dat) == ncontrol_col)[1]] <- "ncontrol"
if(!is.numeric(dat$ncontrol))
{
message(ncontrol_col, " column is not numeric")
dat$ncontrol <- as.numeric(dat$ncontrol)
}
}
if(n_col %in% names(dat)){
names(dat)[which(names(dat) == n_col)[1]] <- "n"
if(!is.numeric(dat$n)){
message(samplesize_col, " column is not numeric")
dat$n <- as.numeric(dat$n)
}
if("ncontrol" %in% names(dat) & "ncase" %in% names(dat)){
index <- is.na(dat$n) & !is.na(dat$ncase) & !is.na(dat$ncontrol)
if(any(index)){
message("Generating sample size from ncase and ncontrol")
dat$n[index] <- dat$ncase[index] + dat$ncontrol[index]
}
}
}else if("ncontrol" %in% names(dat) & "ncase" %in% names(dat)){
message("Generating sample size from ncase and ncontrol")
dat$n <- dat$ncase + dat$ncontrol
}
if(!"n" %in% names(dat) & (!is.null(n))){
message("Generating sample size from specified sample size")
dat$n = n
}
# Check pval
if(p_col %in% names(dat) & log_pval){
names(dat)[which(names(dat) == p_col)[1]] <- "p"
dat$p <- 10^-dat$p
}
if(p_col %in% names(dat)){
names(dat)[which(names(dat) == p_col)[1]] <- "p"
if(!is.numeric(dat$p)){
message("pval column is not numeric. Coercing...")
dat$p <- as.numeric(dat$pval)
}
message("Remove SNPs with p value < 0 or p value > 1")
dat = subset(dat, p>=0 & p <=1)
}
# change colnames to upcase
dat <- setNames(dat, toupper(names(dat)))
# calculate Z if not contain Z, but with b se or p and sign
if("Z" %in% names(dat)){
dat$chi2 = dat$Z^2
}
if((! "Z" %in% names(dat)) & ("B" %in% names(dat) & "SE" %in% names(dat))){
message("Infer Z score from b/se ...")
dat$Z = dat$B/dat$SE
dat$chi2 = dat$Z^2
}
# calculate Z from p value
if("P" %in% names(dat)){
if("B" %in% names(dat) & ! "Z" %in% names(dat)){
dat$chi2 = qchisq(dat$P,1,lower.tail = F)
message("Infer Z score from p value and b ...")
dat$Z = sign(dat$B)* sqrt(dat$chi2)
}
}
# calculate chi2 if no chi2
if(!"chi2" %in% names(dat)) dat$chi2 = dat$Z^2
# calculate P if not contain P
if(!"P" %in% names(dat)) dat$P = pchisq(dat$chi2, 1, lower.tail = F)
if(! "Z" %in% names(dat)){
stop("Error: No information for Zscore ")
}
if(n_qc==T){
n_min = quantile(dat$n, 0.9)/1.5
message("Remove SNPs with sample size <= n_min... ")
dat = subset(dat, N > n_min)
}
message("Remove SNPs with chi2 > max(n/1000,80)... ")
dat = subset(dat, chi2 < max(median(dat$N)/1000,80))
message("The formatted data has ", nrow(dat), " dat lines. \n")
return(dat[, c("SNP","A1","A2","Z","N","chi2","P")])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.