R/load.summary.R
In MultiABEL: Multi-Trait Genome-Wide Association Analysis

#' Loading multiple summary statistics from genome-wide association studies 
#' 
#' The function loads multiple meta-GWAS summary statistics, for subsequent multi-trait GWAS. 
#' Currently, the package only analyzes summary statistics from inverse-Gaussianized continuous traits.
#' 
#' @param files A vector of file names as strings. Each file name should contain summary statistics of
#' one trait to be included in the multi-trait analysis. The columns of the summary statistics have to
#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele),
#' \code{'a2'} (the second allele), \code{'freq'},
#' (frequency of the first allele), \code{'beta'} (effect size), \code{'se'} (standard error), and 
#' \code{'n'} (sample size).
#' @param cor.pheno A #traits x #traits matrix of correlation matrix of the phenotypes, to be used to 
#' construct the multi-trait test statistic. If \code{NULL},
#' this matrix will be estimated from genome-wide summary statistics. If you have partially overlapping 
#' samples for different traits, shrinkage correlation matrix is recommended (see reference), so in that
#' case, unless you know what you are doing, leave this argument as default, i.e. \code{NULL}.
#' @param indep.snps A vector of strings containing the names of a set of independent SNPs. This is 
#' recommended to be generated by LD-pruning the genotype data in a certain cohort. Typically the 
#' number of SNPs should be more than 10,000 in order to obtain a good estimate of \code{cor.pheno}. If
#' \code{cor.pheno = NULL}, this argument cannot be \code{NULL}.
#' @param est.var A logical value. If \code{FALSE}, each phenotypic variance is assumed to be known as 1. 
#' If \code{TRUE}, each phenotypic variance will be estimated to adjust the summary statistics, so that
#' the corresponding phenoypic variance is 1.
#' @param columnNames A vector with names of columns containing necessary information in the input file;
#' default values are c('snp','a1','a2','freq','beta','se','n'). The values are case-insensitive. Note: check
#' your allele definitions for different traits are based on the same strand!
#' @param fixedN sample size to assume across all analyses, when provided, this number will be used
#' (instead of the ones specified in the input files)
#' 
#' @return The function returns a list of class \code{multi.summary}, containing two elements: \code{gwa}
#' (the cleaned data to be processed in multi-trait GWAS), \code{cor.pheno} (user input or estimated), and 
#' \code{var.pheno} (default or estimated).
#' 
#' @author Xia Shen, Yakov, Tsepilov, Yurii S. Aulchenko
#' 
#' @references 
#' Xia Shen, Yakov Tsepilov, ..., Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2017).
#' Discovery, replication, and in silico functional investigation of 
#' 22 new pleiotropic anthropometric loci. \emph{Submitted}.
#' 
#' @seealso 
#' \code{MultiSummary}
#' 
#' @examples 
#' \dontrun{
#' ## download the six example files from:
#' ## https://www.dropbox.com/sh/hhta45cewvvea2s/AADfj4OXlbroToZAwIii2Buha?dl=0
#' ## the summary statistics from Randall et al. (2013) PLoS Genet
#' ## for males only
#' ## bmi: body mass index
#' ## hip: hip circumference
#' ## wc: waist circumference
#' ## whr: waist-hip ratio
#' 
#' ## load the prepared set of independent SNPs
#' indep.snps <- as.character(read.table('indep.snps')$V1)
#' 
#' ## load summary statistics of the six traits
#' stats.male <- load.summary(files = list.files(pattern = '*.txt'), indep.snps = indep.snps)
#' 
#' ## perform multi-trait meta-GWAS
#' result <- MultiSummary(stats.male)
#' head(result)
#' }
#' @aliases load.summary
#' @keywords multivariate, meta-analysis
#' 
`load.summary` <- function(files, cor.pheno = NULL, indep.snps = NULL, 
		est.var = FALSE, 
		columnNames = c('snp', 'a1', 'a2', 'freq', 'beta', 'se', 'n'),
		fixedN = NULL) {
	
	### I. Sanity checks ###
	
	# require(data.table)
	
	if (!all(is.character(files))) {
		stop('files should be given as strings!')
	}
	
	if (sum(file.exists(files)) < 2) {
		stop('number of traits has to be more than 2!')
	}
	
	if (is.null(cor.pheno) & is.null(indep.snps)) {
		stop('indep.snps required for estimating cor.pheno!')
	}
	
	m <- length(files)
	if (!is.null(cor.pheno)) {
		if (nrow(cor.pheno) != m | ncol(cor.pheno) != m) {
			stop('wrong dimensions of cor.pheno!')
		}
	}
	
	columnNames <- tolower(columnNames)
	
	if (!is.null(fixedN)) if (fixedN <= 0) {
			stop('fixedN should be a positive number')
		}
	
	if (is.null(fixedN)) { colNamLen = 7 } else { colNamLen = 6 }
	
	if (!is.character(columnNames)) {
		stop('columnNames should be character')
	}
	
	if (length(columnNames) != colNamLen) {
		cat('columnNames should be a vector with',colNamLen,'elements')
		stop('... exiting')
	}
	
	if (length(unique(columnNames)) != colNamLen) {
		stop('elements of columnNames must be unique')
	}
	
	cNT = list(
			'snp' = columnNames[1],
			'a1'  = columnNames[2],
			'a2' = columnNames[3],
			'freq'= columnNames[4],
			'beta'= columnNames[5],
			'se'  = columnNames[6],
			'n'   = columnNames[7]
	)
	
	### II. Data load ###
	
	cat("loading data ...\n")
	
	data <- c()
	fn <- files # rev(files)
	vys <- rep(1, m)
	for (i in m:1) {
		
		# Reserved for possible problems with data.table package
		# dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
		
		dd <- data.table::fread(fn[i], header = TRUE, stringsAsFactors = FALSE,verbose = FALSE, showProgress = FALSE, data.table = FALSE)
		
		colnames(dd) <- tolower(colnames(dd))
		currentColNames <- colnames(dd)
		if (any(!(columnNames %in% currentColNames))) {
			stop('file column names do not match columnNames in ', fn[i], '... exiting!')
		}
		idx <- which(duplicated(dd[, cNT$snp]))
		if (length(idx) > 0) {
			data[[i]] <- dd[-idx,]
			rownames(data[[i]]) <- dd[ -idx , cNT$snp]
		} else {
			data[[i]] <- dd
			rownames(data[[i]]) <- dd[, cNT$snp]
		}
		if (est.var) {
			if (is.null(fixedN)) {
				D <-  dd[,cNT$n]*2*dd[, cNT$freq]*(1 - dd[, cNT$freq])
				vy <- D*dd[ , cNT$se]**2 + D*dd[, cNT$beta]**2/(dd[, cNT$n] - 1)
			} else {
				D <-  fixedN*2*dd[, cNT$freq]*(1 - dd[, cNT$freq])
				vy <- D*dd[ , cNT$se]**2 + D*dd[, cNT$beta]**2/(fixedN - 1)
			}
			dvy <- density(na.omit(vy))
			vys[i] <- dvy$x[which.max(dvy$y)] #median(vy, na.rm = TRUE)
		}
		progress((m - i + 1)/m*100)
	}
	cat('\n')
	if (est.var) cat('phenotypic variances are:', vys, '\n')
	
	### III. SNP cheking and cleaning ###
	
	cat("checking markers ...\n")
	
	snps <- data[[1]][,cNT$snp]
	for (i in 1:m) {
		data[[i]]=na.omit(data[[i]][,unlist(cNT)])
		snps=intersect(snps,data[[i]][, cNT$snp])
		progress(i/m * 100)
	}
	
	snps <- unique(snps)
	cat("\n")
	
	cat("cleaning data ...\n")
	for (i in 1:m) {
		data[[i]] <- data[[i]][snps, ]
		progress(i/m * 100)
	}
	cat("\n")
	cat("correcting parameters ...\n")
	
	for (i in 1:m) {
		data[[i]][, cNT[["a1"]]] = toupper(data[[i]][, cNT[["a1"]]])
		data[[i]][, cNT[["a2"]]] = toupper(data[[i]][, cNT[["a2"]]])
	}
	
	d1a12 <- paste(data[[1]][,c(cNT$a1)],data[[1]][,c(cNT$a2)], sep = "")
	d1a21 <- paste(data[[1]][,c(cNT$a2)],data[[1]][,c(cNT$a1)], sep = "")
	
	
	snps=rownames(data[[1]])
	
	i <- 2
	for (i in 2:m) {
		
		dia12 <- paste(data[[i]][,c(cNT$a1)],data[[i]][,c(cNT$a2)], sep = "")
		
		ind=which(((dia12==d1a12) + (dia12==d1a21))==1)
		snps=intersect(rownames(data[[i]])[ind],snps)
		
		if (any(data[[i]][, cNT[["a1"]]] != data[[1]][, cNT[["a1"]]])) {
			adj <- 2 * as.numeric(data[[i]][, cNT[["a1"]]] == data[[1]][, cNT[["a1"]]]) - 1
			data[[i]][, cNT$beta] <- data[[i]][, cNT$beta] *adj
			data[[i]][, cNT$freq] <- (adj == 1) * data[[i]][,cNT$freq] + (adj == -1) * (1 - data[[i]][, cNT$freq])
		}
		progress(i/m * 100)
	}
	
	for (i in 1:m) {
		data[[i]] <- data[[i]][snps, ]
		#progress(i/m * 100)
	}
	
	cat("\n")
	
	### IV. Sample size and finalisation ###
	
	cat("adjusting sample size ... ")
	n0 <- matrix(NA, nrow(data[[1]]), m)
	if (is.null(fixedN)) {
		for (i in 1:m) {
			n0[, i] <- data[[i]][, cNT$n]
		}
	}else {
		for (i in 1:m) {
			n0[, i] <- fixedN
		}
	}
	n <- apply(n0, 1, "min")
	cat("done.\n")
	
	cat('finalizing summary statistics ...\n')
	gwa0 <- matrix(NA, nrow(data[[1]]), 2*m + 2)
	for (i in 1:m) {
		gwa0[,i*2 - 1] <- data[[i]][,cNT$beta]
		gwa0[,i*2] <- data[[i]][,cNT$se]
		progress(i/m*100)
	}
	gwa0[,2*length(data) + 1] <- data[[1]][,cNT$freq]
	gwa0[,2*length(data) + 2] <- n
	rownames(gwa0) <- data[[1]][,cNT$snp]
	gwa0 <- na.omit(gwa0)
	cat('\n')
	
	if (is.null(cor.pheno)) {
		n.ratio <- diag(m)
		for (i in 1:(m - 1)) {
			for (j in (i + 1):m) {
				ratio <- mean(sqrt(n0[,j]/n0[,i]), na.rm = TRUE)
				n.ratio[i,j] <- n.ratio[j,i] <- ifelse(ratio > 1, 1/ratio, ratio)
			}
		}
		if (any(n.ratio < 1)) {
			cat('samples partially overlap!\n')
			cat('estimating shrinkage phenotypic correlations ... ')
		} else {
			cat('estimating phenotypic correlations ... ')
		}
		idx <- which(rownames(gwa0) %in% indep.snps)
		gwa1 <- gwa0[idx,]
		z <- gwa1[,seq(1, 2*m, 2)]/gwa1[,seq(2, 2*m, 2)]
		cor.pheno <- cor(z, use = 'pairwise.complete.obs')
		cat('done.\n')
	}
	
	dimnames(cor.pheno) <- list(files, files)
	gwanames <- c(paste(rep(files, each = 2), rep(c('.beta', '.se'), m), sep = ''), 'f', 'n')
	colnames(gwa0) <- gwanames
	
	A12 <- data[[1]][rownames(gwa0), c(cNT[["a1"]],cNT[["a2"]])]
	colnames(A12) <- c("A1","A2")
	
	dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys, alleles = A12)
	class(dd) <- "multi.summary"
	return(dd)
	
}