Nothing
VTWOD <-
function(x, y, polyphen.weight, flipPhenotype = 0,
npermutation = 1000, npermutation.max, min.nonsignificant.counts)
{
#####################################################
# This script implements the VT test for pooled association of rare variants with a phenotype.
# See Price et al. AJHG 2010
# The script also includes implementations of T1, T5, and WE (Madsen-Browning)
# tests, optionally weighted with PolyPhen scores.
# 2011 - included new test designed for two-tailed variants (WOD at 1% and 5%)
#
# For speed, the script supports three modes of running: local on single CPU,
# multicore, and cluster (see options below).
#
# NOTE: currently, the script is configured to run the tests on a single gene.
#
# <npermutation> is an integer number of npermutation to perform
# <phenotypeFile> is the name of a file with lines of format: individualID phenotypeValue
# <snpWeightFile> is the name of a file with lines of format (weight is between 0 and 1): snpid weight
# <genotypeFile> is the name of a file with lines of format (genotype is the number of rare alleles of the snp in the individual, typically one of {0,1,2}): individualID snpid genotype
# <seed> is an optional random seed value
#
# Output: p-values for
# score1, score1P - test T1 and T1P (see paper)
# score2, score2P - test T5 and T5P (see paper)
# score3, score3P - test WE and WEP (see paper)
# score4, score4P - test VT and VTP (see paper)
# WOD.01 - WOD method with 1% threshold
# WOD.05 - WOD method with 5% threshold
#
# This R implementation by Adam Kiezun, based on reference implementation in C by Alkes Price.
# Added WOD tests to the program in 2011 by Celia Greenwood
#######################################################
#
#source("E:/Changjiang Xu/raresnps/WOD_program/WOD.VariantTestsCJ.R")
#
#indFile <- "sample.pheno.txt" # indid, pheno
#snpFile <- "sample.polyphen2.weight" # polyphen scores-set to 0.5 if missing
#genoFile <- "sample.geno" # file with genotypes
# # indid, snpid, count(0,1,2) rare alleles
#flipPhenotype <- 0 # logical, should the phenotype be multiplied by -1
#npermutation <- 1000 # number of npermutation to perform
#seed <- 10985 # random seed, integer
#set.seed(seed)
#
# read data and remove problem individuals
#DIR<- "E:/Changjiang Xu/raresnps/WOD_program/"
#ind <- read.table(paste(DIR, indFile, sep=""), col.names=c("indid", "pheno"))
#csnp <- read.table(paste(DIR, snpFile, sep=""), col.names=c("snpid", "polyphen"))
#cgeno <- read.table(paste(DIR, genoFile, sep=""), col.names=c("indid", "snpid", "count"))
#source("E:/Changjiang Xu/raresnps/WOD_program/count2geno.R")
#source("E:/Changjiang Xu/raresnps/WOD_program/geno2count.R")
#geno<- count2geno(cgeno)
#cgeno<- geno2count(geno)
#
#######################################################
#Data formatting
if (is.null(colnames(x))) colnames(x)<- paste("snp", 1:ncol(x), sep="")
if (is.null(rownames(x))) rownames(x)<- 1:nrow(x)
cgeno <- geno2count(x)
ind <- data.frame(indid = rownames(x), pheno = y, stringsAsFactors = FALSE)
if (missing(polyphen.weight)) csnp<- data.frame(snpid = colnames(x), polyphen = rep(1, ncol(x)), stringsAsFactors = FALSE)
else {csnp<- as.data.frame(polyphen.weight, stringsAsFactors = FALSE)
colnames(csnp)<- c("snpid", "polyphen")
}
#permutation parameters
if (missing(min.nonsignificant.counts)) min.nonsignificant.counts<- 10 #?10, ?20
if (missing(npermutation.max)) npermutation.max<- npermutation
#######################################################
#The following codes are same with WOD.VariantTests.R except adding adaptive permutation.
#######################################################
#Sometimes phenotypes are annotated the opposite way of what we're expecting. If yes, then flip.
ind <- ind[,c("indid", "pheno")]
if (flipPhenotype){
cat("flipping phenotypes\n")
ind$pheno <- -1.0*ind$pheno
}
meanpheno <- mean(ind$pheno)
# VT code
#For each SNP, how many times it is seen
csnp$counts <- sapply(csnp$snpid, function(x){ sum(cgeno[cgeno$snpid == x,]$count) })
#For a SNP with total count c, how many counts are lower than c? (ie what is the rank or c in the order of counts)
csnp$countg <- sapply(csnp$counts, function(x){ length(unique(csnp[csnp$counts < x,]$counts)) })
#Sample size
N <- dim(ind)[1]
m1 <- merge(cgeno, csnp, by=c("snpid"))
#adjust polyphen scores
if (length(m1[(m1$counts >= N/50) & (m1$polyphen < 1.0),]$polyphen) > 0){
m1[(m1$counts >= N/50) & (m1$polyphen < 1.0),]$polyphen <- 0.5
}
#pre-compute metrics that are independent of npermutation
m1$countSquare <- m1$count*m1$count
m1$countPolyphen <- m1$count*m1$polyphen
m1$countSquarePolyphenSquare <- m1$countPolyphen*m1$countPolyphen
f <- (1+m1$counts)/(2+2*N)
m1$weight <- 1/sqrt(f*(1.0-f))
m1$countWeight <- m1$count*m1$weight
#Create a single table by joining SNPs and genotypes by SNPid, and joining individuals by individual ID
m <- merge(m1, ind, by=c("indid"))
m <- m[m$counts < N,] #ignore common variants
#Compute sum for subsets of indices (the subsets are pre-computed)
mysum <- function(X, range, arr, whiches){
for (i in range) { arr[i] <- sum(X[whiches[[i]]])}
arr
}
#To improve speed, pre-compute everything that is independent of npermutation
ctg <- m$countg
mCount <- m$count;
mPolyphen <- m$polyphen;
mCountWeight <- m$countWeight
indPheno <- ind$pheno
nx <- length(ctg)
fctg <- as.factor(list(ctg)[[1]])
index <- fctg
one <- 1L
group <- rep.int(one, nx) + one * (as.integer(index) - one)
len <- length(unique(group))
arr <- double(len)
oneToLen <- 1:len
Msize <- dim(m)[1]
whiches <- vector("list", len)
for (i in oneToLen) { whiches[[i]] <- which(group == i) }
count <- mysum(mCount, range=oneToLen, arr=arr, whiches=whiches)
countSquare <- mysum(m$countSquare, range=oneToLen, arr=arr, whiches=whiches)
countSquarePolyphenSquare <- mysum(m$countSquarePolyphenSquare, range=oneToLen, arr=arr, whiches=whiches)
countPolyphen <- mysum(m$countPolyphen, range=oneToLen, arr=arr, whiches=whiches)
#Indices of variants below frequency thresholds
mBelow50 <- which(m$counts < N/50)
mBelow10 <- which(m$counts < N/10)
#Pre-compute cumulative sums, for VT test
csCount <- cumsum(count)
csCountSquare <- cumsum(countSquare)
csCountPolyphenMeanpheno <- cumsum(countPolyphen * meanpheno)
csCountSquarePolyphenSquare <- cumsum(countSquarePolyphenSquare)
csCountMeanpheno <- csCount*meanpheno
sqrtCsCountSquare <- sqrt(csCountSquare)
sqrtCsCountSquarePolyphenSquare <- sqrt(csCountSquarePolyphenSquare)
#Indices of individuals from m in ind (may be duplicate)
matchIds <- match(m$indid, ind$indid)
# Items needed for CG's implementation of Liu & Leal
csnp$rare01 = (csnp$counts/(2*N)<0.01) # 1% threshold for rare
csnp$rare05 = (csnp$counts/(2*N)<0.05) # 5% threshold for rare
# find all individuals who carry no rare variants at 1% or 5%
ind$norare01 = sapply(ind$indid, function(x) {
all(is.na(match(m1[m1$indid==x,]$snpid, csnp$snpid[csnp$rare01] )))})
ind$norare05 = sapply(ind$indid, function(x) {
all(is.na(match(m1[m1$indid==x,]$snpid, csnp$snpid[csnp$rare05] )))})
################################################################
#Compute the test scores for many tests, for 1 permutation
getScores <- function(permute){
if (permute){
pheno.cg <- sample(ind$pheno)
pheno <- pheno.cg[matchIds]
} else {
pheno.cg <- ind$pheno
pheno <- pheno.cg[matchIds]
}
phenoCount <- pheno * mCount
phenoCountPolyphen <- phenoCount * mPolyphen
phenoCountWeight <- pheno * mCountWeight
#Scores that count only rare variants, optionally weighted
score1 <- sum(phenoCount[mBelow50])
score2 <- sum(phenoCount[mBelow10])
score1P <- sum(phenoCountPolyphen[mBelow50])
score2P <- sum(phenoCountPolyphen[mBelow10])
#Madsen-Browning score, optionally weighted
score3 <- sum(phenoCountWeight)
score3P <- sum(phenoCountWeight * mPolyphen)
#VT test, optionally weighted
#Aggregate for each count, to find the optimal threshold for VT test
csPhenoCount <- cumsum(mysum(phenoCount, range=oneToLen, arr=arr, whiches=whiches))
csPhenoCountPolyphen <- cumsum(mysum(phenoCountPolyphen, range=oneToLen, arr=arr, whiches=whiches))
score4 <- max((csPhenoCount-csCountMeanpheno)/sqrtCsCountSquare)
score4P <- max((csPhenoCountPolyphen-csCountPolyphenMeanpheno)/sqrtCsCountSquarePolyphenSquare)
# new code by CG for Liu & Leal extension
pheno.mean <- mean(pheno.cg[ind$norare01])
pheno.sd <- sd(pheno.cg[ind$norare01])
quant <- pnorm(pheno.cg, pheno.mean, pheno.sd)
probnonnull <- 1 - 2*(quant*(quant<0.5) + (1-quant)*(quant>=0.5))
stat.b <- t(abs(pheno.cg-meanpheno)) %*% probnonnull / sum(probnonnull)
WOD.01 <- -stat.b # do this because score expected to get larger
# under permuted null.
pheno.mean <- mean(pheno.cg[ind$norare05])
pheno.sd <- sd(pheno.cg[ind$norare05])
quant <- pnorm(pheno.cg, pheno.mean, pheno.sd)
probnonnull <- 1 - 2*(quant*(quant<0.5) + (1-quant)*(quant>=0.5))
stat.b <- t(abs(pheno.cg-meanpheno)) %*% probnonnull / sum(probnonnull)
WOD.05 <- -stat.b
c(score1=score1, score1P=score1P, score2=score2, score2P=score2P,
score3=score3, score3P=score3P, score4=score4,
score4P=score4P, WOD.01 = WOD.01, WOD.05 = WOD.05)
}
##################################################################
#For a specific score, returns how often permuted data has a higher score than unpermuted data.
permwins <- function(scores.df, scorename) {
unpermuted <- (unpermutedScores[c(scorename)])[[1]]
ceiling(sum(scores.df[,c(scorename)] > unpermuted) + 0.5*sum(scores.df[,c(scorename)] == unpermuted))
}
##################################################################
#For all scores, returns how often permuted data has a higher score than unpermuted data.
PermLoop <- function(npermutation) {
countwin<- 0
pm<- 0
while ((pm < npermutation) | ((pm < npermutation.max) & (min(countwin) < min.nonsignificant.counts)))
{
pm<- pm + 1
scores <- getScores(permute=TRUE)
scores.df <- as.data.frame(t(scores))
pw1 <- permwins(scores.df, "score1")
pw1P <- permwins(scores.df, "score1P")
pw2 <- permwins(scores.df, "score2")
pw2P <- permwins(scores.df, "score2P")
pw3 <- permwins(scores.df, "score3")
pw3P <- permwins(scores.df, "score3P")
pw4 <- permwins(scores.df, "score4")
pw4P <- permwins(scores.df, "score4P")
pw.nll01 <- permwins(scores.df, "WOD.01")
pw.nll05 <- permwins(scores.df, "WOD.05")
if (pm==1) {
permresult <-
data.frame(t( c(score1=pw1, score1P=pw1P, score2=pw2,
score2P=pw2P, score3=pw3,
score3P=pw3P, score4=pw4, score4P=pw4P, WOD.01=pw.nll01,
WOD.05=pw.nll05))) } else {
permresult <- rbind(permresult,
data.frame(t( c(score1=pw1, score1P=pw1P, score2=pw2,
score2P=pw2P, score3=pw3,
score3P=pw3P, score4=pw4, score4P=pw4P, WOD.01=pw.nll01,
WOD.05=pw.nll05)))) }
countwin<- colSums(permresult)
}
list(permresult = permresult, pm =pm)
}
############################################################
#P-values
pval <- function(permwins){ (permwins+1)/(npermutation+1) }
#################################################################
####################################
# Function Calls start here
##################################
#Unpermuted data for which we're looking for pvalues
unpermutedScores <- as.data.frame(t(getScores(permute=FALSE)))
#print(unpermutedScores)
# Run npermutation
pwpm<- PermLoop(npermutation)
pw <- pwpm$permresult
totalpm<- pwpm$pm
pw.df <- as.data.frame(t(apply(pw, 2, sum)))
#cat("counts where permuted statistics are larger than unpermuted data\n")
#print(pw.df)
#cat("p-values\n")
#options(max.levels = NULL)
#options(width = 500)
#print(pval(pw.df), max.levels = NULL, width = 500)
list(score = unpermutedScores, nonsignificant.counts = pw.df, pvalue.empirical = pval(pw.df), pvalue.nominal = NA, total.permutation = totalpm)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.