#' Polygenic risk score (given only allele frequencies); training
#'
#' Equivalent to maximum likelihood naive Bayes classifier. The discriminant function is
#' \deqn{\sum_j\hat{\beta}_jX_j,}
#' where \eqn{X_j} is the additively coded genotype of SNP j.
#'
#' @param pi0,pi1 p x 1 vectors of control and case minor allele frequencies, respectively; IMPORTANT: must be relative to the same allele in both cases and controls
#' @param n0,n1 number of controls and number of cases, respectively
#'
#' @return
#' \item{pi0}{minor allele frequencies in controls}
#' \item{pi1}{minor allele frequencies in cases}
#' \item{P}{proportion of cases}
#'
#' @examples
#' p <- 1000; ## number of snps
#' I <- rep(0,p); I[1:10] <- 1; ## which snps are causal
#' set.seed(1); pi0 <- runif(p,0.1,0.5); ## control minor allele frequencies
#' set.seed(1); ors <- runif(sum(I),-1,1); ## odds ratios
#' pi1 <- pi0;
#' pi1[I==1] <- expit(ors+logit(pi0[I==1]));
#' n0 <- 100; ## number of controls
#' X0 <- t(replicate(n0,rbinom(p,2,pi0))); ## controls
#' n1 <- 50; ## number of cases
#' X1 <- t(replicate(n1,rbinom(p,2,pi1))); ## cases
#' prs.train(colMeans(X0)/2,colMeans(X1)/2,n0,n1);
#'
#' @import stats
#' @export
prs.train <- function(pi0,pi1,n0,n1){
if(sum(is.na(pi0))>0||sum(is.na(pi1))>0){
stop("Missing values in training data.");
}
## check for MAF
if(min(pi0)==0||min(pi1)==0||max(pi0)==1||max(pi1)==1){
stop("At least one SNP has MAF=0.");
}
return(list(pi0=pi0,pi1=pi1,P=n1/(n0+n1)));
}
#' Polygenic risk score (given only allele frequencies); training with CV
#'
#' Uses CV to select how many SNPs to include. SNPs are ordered by the magnitude of their estimated (ang possibly weighted) allelic log-odds ratio. The discriminant function is
#' \deqn{\sum_j\hat{\beta}_jI\left(\left\vert\frac{\hat{\beta}_j}{w_j}\right\vert>\lambda\right)X_j,}
#' where \eqn{X_j} is the additively coded genotype of SNP j.
#'
#' @param X0,X1 n x p vectors of control and case genotypes, additively coded; IMPORTANT: coding must be relative to the same allele in both cases and controls
#' @param K number of folds for CV
#' @param w p x 1 weight vector
#' @param nlambda number of thresholds to tune over
#' @param verbose if TRUE, report current fold of CV
#'
#' @return
#' \item{pi0}{minor allele frequencies in controls of kept SNPs}
#' \item{pi1}{minor allele frequencies in cases of kept SNPs}
#' \item{w}{weight vector for kept SNPs}
#' \item{lambda}{lambda cutoff}
#' \item{P}{proportion of cases}
#'
#' @examples
#' p <- 1000; ## number of snps
#' I <- rep(0,p); I[1:10] <- 1; ## which snps are causal
#' set.seed(1); pi0 <- runif(p,0.1,0.5); ## control minor allele frequencies
#' set.seed(1); ors <- runif(sum(I),-1,1); ## odds ratios
#' pi1 <- pi0;
#' pi1[I==1] <- expit(ors+logit(pi0[I==1]));
#' n0 <- 100; ## number of controls
#' X0 <- t(replicate(n0,rbinom(p,2,pi0))); ## controls
#' n1 <- 50; ## number of cases
#' X1 <- t(replicate(n1,rbinom(p,2,pi1))); ## cases
#' prs.train.cv(X0,X1,K=3,w=1,nlambda=100,verbose=TRUE);
#'
#' @import stats
#' @export
prs.train.cv <- function(X0,X1,K=3,w=1,nlambda=100,verbose=FALSE){
n0 <- nrow(X0); n1 <- nrow(X1); p <- ncol(X0);
P <- n1/(n0+n1);
pi1 <- colMeans(X1,na.rm=TRUE)/2;
pi0 <- colMeans(X0,na.rm=TRUE)/2;
b <- logit(pi1)-logit(pi0);
lambdas <- seq(0,max(abs(b/w)),length=nlambda);
## cv
ind0 <- sample(1:n0,n0,replace=FALSE);
ind1 <- sample(1:n1,n1,replace=FALSE);
inc0 <- floor(n0/K);
inc1 <- floor(n1/K);
errs.cv <- rep(0,nlambda);
for(k in 1:K){
if(verbose){
cat("Fold ",k," out of ",K,"...\n",sep="");
}
pi1.cv <- colMeans(X1[-ind1[((k-1)*inc1+1):(k*inc1)],],
na.rm=TRUE)/2;
pi0.cv <- colMeans(X0[-ind0[((k-1)*inc0+1):(k*inc0)],],
na.rm=TRUE)/2;
b.cv <- logit(pi1.cv)-logit(pi0.cv);
errs <- sapply(lambdas,function(l){
## thresholds at lambda
## CV might result in some infinite log-odds ratios
thresh <- which(abs(b.cv/w)>l&abs(b.cv/w)!=Inf);
if(length(thresh)==0){
scores <- rep(0,inc0+inc1);
} else {
b.cv <- b.cv[thresh];
pi1.cv <- pi1.cv[thresh];
pi0.cv <- pi0.cv[thresh];
newX.cv <- rbind(matrix(X0[ind0[((k-1)*inc0+1):(k*inc0)],thresh],
ncol=length(thresh)),
matrix(X1[ind1[((k-1)*inc1+1):(k*inc1)],thresh],
ncol=length(thresh)));
scores <- 2*sum(log((1-pi1.cv)/(1-pi0.cv)))+newX.cv%*%b.cv;
}
newY.cv <- c(rep(0,inc0),rep(1,inc1));
classes <- as.numeric(scores>-logit(inc1/(inc0+inc1)));
return(mean(abs(newY.cv-classes)));
});
errs.cv <- errs.cv+errs;
}
opt <- which.min(errs.cv);
return(list(pi0=pi0,pi1=pi1,w=w,lambda=lambdas[opt],P=P));
}
#' Polygenic risk score; prediction
#'
#' @param newX n x p matrix of additively coded genotypes to be predicted; IMPORTANT: must be coded relative to the same allele as in the cases and controls
#' @param prs output of prs.train()
#' @param P prevalence of cases in the testing set; if NULL, P is taken from the train object
#'
#' @return
#' \item{score}{risk score}
#' \item{class}{predicted class, 0=control, 1=case}
#'
#' @examples
#' p <- 1000; ## number of snps
#' I <- rep(0,p); I[1:10] <- 1; ## which snps are causal
#' set.seed(1); pi0 <- runif(p,0.1,0.5); ## control minor allele frequencies
#' set.seed(1); ors <- runif(sum(I),-1,1); ## odds ratios
#' pi1 <- pi0;
#' pi1[I==1] <- expit(ors+logit(pi0[I==1]));
#' ## training data
#' n0 <- 100; ## number of controls
#' X0 <- t(replicate(n0,rbinom(p,2,pi0))); ## controls
#' n1 <- 50; ## number of cases
#' X1 <- t(replicate(n1,rbinom(p,2,pi1))); ## cases
#' prs <- prs.train(colMeans(X0)/2,colMeans(X1)/2,n0,n1);
#' ## testing data
#' newX <- rbind(t(replicate(n0,rbinom(p,2,pi0))),
#' t(replicate(n1,rbinom(p,2,pi1))));
#' newY <- c(rep(0,n0),rep(1,n1));
#' Yhat <- prs.predict(newX,prs);
#' mean(abs(newY-Yhat$class));
#'
#' @import stats
#' @export
prs.predict <- function(newX,prs,P=NULL){
if(sum(complete.cases(newX))!=nrow(newX)){
stop("New data contains missing values.");
}
if(is.null(P)){
P <- prs$P;
}
b <- logit(prs$pi1)-logit(prs$pi0);
score <- 2*sum(log((1-prs$pi1)/(1-prs$pi0)))+
newX%*%b;
return(list(score=score,class=as.numeric(score>=-logit(P))));
}
#' Polygenic risk score; prediction for classifier trained with CV
#'
#' @param newX n x p matrix of additively coded genotypes to be predicted; IMPORTANT: must be coded relative to the same allele as in the cases and controls
#' @param prs output of prs.train.cv()
#' @param P prevalence of cases in the testing set; if NULL, P is taken from the train object
#'
#' @return
#' \item{score}{risk score}
#' \item{class}{predicted class, 0=control, 1=case}
#'
#' @examples
#' p <- 1000; ## number of snps
#' I <- rep(0,p); I[1:10] <- 1; ## which snps are causal
#' set.seed(1); pi0 <- runif(p,0.1,0.5); ## control minor allele frequencies
#' set.seed(1); ors <- runif(sum(I),-1,1); ## odds ratios
#' pi1 <- pi0;
#' pi1[I==1] <- expit(ors+logit(pi0[I==1]));
#' ## training data
#' n0 <- 100; ## number of controls
#' X0 <- t(replicate(n0,rbinom(p,2,pi0))); ## controls
#' n1 <- 50; ## number of cases
#' X1 <- t(replicate(n1,rbinom(p,2,pi1))); ## cases
#' prs <- prs.train.cv(X0,X1,K=3,w=1,nlambda=100,verbose=TRUE);
#' ## testing data
#' newX <- rbind(t(replicate(n0,rbinom(p,2,pi0))),
#' t(replicate(n1,rbinom(p,2,pi1))));
#' newY <- c(rep(0,n0),rep(1,n1));
#' Yhat <- prs.predict.cv(newX,prs);
#' mean(abs(newY-Yhat$class));
#'
#' @import stats
#' @export
prs.predict.cv <- function(newX,prs,P=NULL){
if(sum(complete.cases(newX))!=nrow(newX)){
stop("New data contains missing values.");
}
if(is.null(P)){
P <- prs$P;
}
b <- logit(prs$pi1)-logit(prs$pi0);
keep <- which(abs(b/prs$w)>prs$lambda);
if(length(keep)==0){
score <- rep(0,nrow(newX));
} else {
newX <- matrix(newX[,keep],ncol=length(keep));
score <- 2*sum(log((1-prs$pi1[keep])/(1-prs$pi0[keep])))+
newX%*%b[keep];
}
return(list(score=score,class=as.numeric(score>=-logit(P))));
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.