#####################################################
# This script implements the VT test for pooled association of rare variants with a phenotype.
# See Price et al. AJHG 2010
# The script also includes implementations of T1, T5, and WE (Madsen-Browning) tests, optionally weighted with PolyPhen scores.
# (see the paper for details).
#
# For speed, the script supports three modes of running: local on single CPU, multicore, and cluster (see options below).
#
# NOTE: currently, the script is configured to run the tests on a single gene.
#
# Usage Rscript rareVariantTests.R -p <permutations> -n <nodes> -a <phenotypeFile> -b <snpWeightFile> -c <genotypeFile> [--multicore] [--seed seed]
# <permutations> is an integer number of permutations to perform
# <nodes> is an integer number of nodes in the SunGridEngine to use (set 0 to run the script locally, without a cluster)
# <phenotypeFile> is the name of a file with lines of format: individualID phenotypeValue
# <snpWeightFile> is the name of a file with lines of format (weight is between 0 and 1): snpid weight
# <genotypeFile> is the name of a file with lines of format (genotype is the number of rare alleles of the snp in the individual, typically one of {0,1,2}): individualID snpid genotype
# --multicore is an optional flag that indicates that mutliple CPUs of the machine can be used for computations (using this flag implies that the cluster is not to be used)
# <seed> is an optional random seed value
#
# Example 1 (run on multicore):
# Rscript --slave rareVariantTests.R -p 100000 -n 0 -a AHITUV/data1.ind0 -b AHITUV/data1.csnp0 -c AHITUV/data1.cgeno0 --multicore
#
# Example 2 (run on a single CPU):
# Rscript --slave rareVariantTests.R -p 100000 -n 0 -a AHITUV/data1.ind0 -b AHITUV/data1.csnp0 -c AHITUV/data1.cgeno0
#
# Example 3 (run on Sun Grid Engine cluster - split the permutations into 50 parts):
# Rscript --slave rareVariantTests.R -p 100000 -n 50 -a AHITUV/data1.ind0 -b AHITUV/data1.csnp0 -c AHITUV/data1.cgeno0
#
# Output: p-values for
# score1, score1P - test T1 and T1P (see paper)
# score2, score2P - test T5 and T5P (see paper)
# score3, score3P - test WE and WEP (see paper)
# score4, score4P - test VT and VTP (see paper)
#
# This R implementation by Adam Kiezun, based on reference implementation in C by Alkes Price.
#######################################################
##suppressPackageStartupMessages(library(getopt))
#suppressPackageStartupMessages(library(Rsge))
suppressPackageStartupMessages(library(doMC))
suppressPackageStartupMessages(library(pryr))
suppressPackageStartupMessages(library(parallel))
vt.fast <- function(ind, csnp, cgeno, flipPhenotype=F, permutations=1000, nodes=48, multicore=1, seed=0) {
#Sometimes phenotypes are annotated the opposite way of what we're expecting. If yes, then flip.
ind <- ind[,c("indid", "pheno")]
if (flipPhenotype){
cat("flipping phenotypes\n")
ind$pheno <- -1.0*ind$pheno
}
meanpheno <- mean(ind$pheno)
#For each SNP, how many times it is seen
csnp$counts <- sapply(csnp$snpid, function(x){ sum(cgeno[cgeno$snpid == x,]$count) })
#For a SNP with total count c, how many counts are lower than c? (ie what is the rank or c in the order of counts)
csnp$countg <- sapply(csnp$counts, function(x){ length(unique(csnp[csnp$counts < x,]$counts)) })
#Sample size
N <- dim(ind)[1]
m1 <- merge(cgeno, csnp, by=c("snpid"))
#adjust polyphen scores
if (length(m1[(m1$counts >= N/50) & (m1$polyphen < 1.0),]$polyphen) > 0){
m1[(m1$counts >= N/50) & (m1$polyphen < 1.0),]$polyphen <- 0.5
}
#pre-compute metrics that are independent of permutations
m1$countSquare <- m1$count*m1$count
m1$countPolyphen <- m1$count*m1$polyphen
m1$countSquarePolyphenSquare <- m1$countPolyphen*m1$countPolyphen
f <- (1+m1$counts)/(2+2*N)
m1$weight <- 1/sqrt(f*(1.0-f))
m1$countWeight <- m1$count*m1$weight
#Create a single table by joining SNPs and genotypes by SNPid, and joining individuals by individual ID
m <- merge(m1, ind, by=c("indid"))
m <- m[m$counts < N,] #ignore common variants
#Compute sum for subsets of indices (the subsets are pre-computed)
mysum <- function(X, range, arr, whiches){
for (i in range) { arr[i] <- sum(X[whiches[[i]]])}
arr
}
#To improve speed, pre-compute everything that is independent of permutations
ctg <- m$countg
mCount <- m$count;
mPolyphen <- m$polyphen;
mCountWeight <- m$countWeight
indPheno <- ind$pheno
nx <- length(ctg)
fctg <- as.factor(list(ctg)[[1]])
index <- fctg
one <- 1L
group <- rep.int(one, nx) + one * (as.integer(index) - one)
len <- length(unique(group))
arr <- double(len)
oneToLen <- 1:len
Msize <- dim(m)[1]
whiches <- vector("list", len)
for (i in oneToLen) { whiches[[i]] <- which(group == i) }
count <- mysum(mCount, range=oneToLen, arr=arr, whiches=whiches)
countSquare <- mysum(m$countSquare, range=oneToLen, arr=arr, whiches=whiches)
countSquarePolyphenSquare <- mysum(m$countSquarePolyphenSquare, range=oneToLen, arr=arr, whiches=whiches)
countPolyphen <- mysum(m$countPolyphen, range=oneToLen, arr=arr, whiches=whiches)
#Indices of variants below frequency thresholds
mBelow50 <- which(m$counts < N/50)
mBelow10 <- which(m$counts < N/10)
#Pre-compute cumulative sums, for VT test
csCount <- cumsum(count)
csCountSquare <- cumsum(countSquare)
csCountPolyphenMeanpheno <- cumsum(countPolyphen * meanpheno)
csCountSquarePolyphenSquare <- cumsum(countSquarePolyphenSquare)
csCountMeanpheno <- csCount*meanpheno
sqrtCsCountSquare <- sqrt(csCountSquare)
sqrtCsCountSquarePolyphenSquare <- sqrt(csCountSquarePolyphenSquare)
#Indices of individuals from m in ind (may be duplicate)
matchIds <- match(m$indid, ind$indid)
#Compute the test scores for many tests, for 1 permutation
getScores <- function(permute){
if (permute){
pheno <- sample(ind$pheno)[matchIds]
} else {
pheno <- ind$pheno[matchIds]
}
phenoCount <- pheno * mCount
phenoCountPolyphen <- phenoCount * mPolyphen
phenoCountWeight <- pheno * mCountWeight
#Scores that count only rare variants, optionally weighted
#score1 <- sum(phenoCount[mBelow50])
#score2 <- sum(phenoCount[mBelow10])
#score1P <- sum(phenoCountPolyphen[mBelow50])
#score2P <- sum(phenoCountPolyphen[mBelow10])
#Madsen-Browning score, optionally weighted
#score3 <- sum(phenoCountWeight)
#score3P <- sum(phenoCountWeight * mPolyphen)
#VT test, optionally weighted
#Aggregate for each count, to find the optimal threshold for VT test
csPhenoCount <- cumsum(mysum(phenoCount, range=oneToLen, arr=arr, whiches=whiches))
csPhenoCountPolyphen <- cumsum(mysum(phenoCountPolyphen, range=oneToLen, arr=arr, whiches=whiches))
score4 <- max((csPhenoCount-csCountMeanpheno)/sqrtCsCountSquare)
score4P <- max((csPhenoCountPolyphen-csCountPolyphenMeanpheno)/sqrtCsCountSquarePolyphenSquare)
#c(score1=score1, score1P=score1P, score2=score2, score2P=score2P, score3=score3, score3P=score3P, score4=score4, score4P=score4P)
c(score4P=score4P, score4=score4)
}
#Unpermuted data for which we're looking for pvalues
unpermutedScores <- as.data.frame(t(getScores(permute=FALSE)))
#For a specific score, returns how often permuted data has a higher score than unpermuted data.
permwins <- function(scores.df, scorename) {
unpermuted <- (unpermutedScores[c(scorename)])[[1]]
ceiling(sum(scores.df[,c(scorename)] > unpermuted) + 0.5*sum(scores.df[,c(scorename)] == unpermuted))
}
#For all scores, returns how often permuted data has a higher score than unpermuted data.
getPermWins <- function(subrange, seed){
set.seed(seed)
scores <- sapply(X = subrange, simplify = T, USE.NAMES = T, FUN = function(x) { getScores(permute=TRUE) })
scores.df <- as.data.frame(t(scores))
#pw1 <- permwins(scores.df, "score1")
#pw1P <- permwins(scores.df, "score1P")
#pw2 <- permwins(scores.df, "score2")
#pw2P <- permwins(scores.df, "score2P")
#pw3 <- permwins(scores.df, "score3")
#pw3P <- permwins(scores.df, "score3P")
pw4 <- permwins(scores.df, "score4")
pw4P <- permwins(scores.df, "score4P")
#c(score1=pw1, score1P=pw1P, score2=pw2, score2P=pw2P, score3=pw3, score3P=pw3P, score4=pw4, score4P=pw4P)
c(score4P=pw4P)
}
#P-values
pval <- function(permwins){ (permwins+1)/(permutations+1) }
#Splits the range of all permutations to k parts
splitRanges <- function(permutations, k) {
i <- 1:permutations
if (k == 0)
list(i)
else
structure(split(i, cut(i, k)), names = NULL)
}
if (is.null(multicore)){
#Running on the cluster
cluster <- nodes != 0
#cat("running ", permutations, " permutations on ", nodes, "nodes " , " cluster=", cluster, "\n")
options(sge.user.options = paste(getOption("sge.user.options"), " -p -1", sep="")) #SGE job priorities
subranges <- splitRanges(permutations, nodes)
pw <- sge.parSapply(cluster=cluster, global.savelist=c("matchIds", "opt", "csCount", "csCountSquare", "csCountPolyphenMeanpheno", "csCountSquarePolyphenSquare", "csCountMeanpheno", "sqrtCsCountSquare", "sqrtCsCountSquarePolyphenSquare","indPheno", "Msize", "mCount", "mPolyphen", "mCountWeight", "mBelow50", "mBelow10", "mysum", "oneToLen", "whiches", "arr", "m", "meanpheno", "N", "csnp", "ind", "cgeno", "subranges", "getScores", "permwins", "unpermutedScores"), function.savelist=c("getPermWins"), njobs=length(subranges), 1:length(subranges), function(i) { getPermWins(subranges[[i]], seed=seed+i)})
pw.df <- as.data.frame(t(pw))
pw.df <- as.data.frame(t(apply(pw.df, 2, sum)))
} else {
#Running multicore
#XXX: detectCores is an internal function of the multicores package
cores <- nodes
#cores <- detectCores()
#cat("running ", permutations, " permutations on ", cores, " cores ", "\n")
registerDoMC()
subranges <- splitRanges(permutations, cores)
pw <- foreach(i=1:cores, .combine='+') %dopar% getPermWins(subranges[[i]], seed=seed+i)
pw.df <- as.data.frame(t(pw))[1,]
}
return(pval(pw.df))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.