Nothing
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' Perform GWAS using FarmCPU method
#'
#' Date build: Febuary 24, 2013
#' Last update: May 25, 2017
#' Requirement: Y, GD, and CV should have same taxa order. GD and GM should have the same order on SNPs
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param phe phenotype, n by t matrix, n is sample size, t is number of phenotypes
#' @param geno genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.
#' @param map SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param CV covariates, n by c matrix, n is sample size, c is number of covariates
#' @param P start p values for all SNPs
#' @param method.sub method used in substitution process, five options: 'penalty', 'reward', 'mean', 'median', or 'onsite'
#' @param method.sub.final method used in substitution process, five options: 'penalty', 'reward', 'mean', 'median', or 'onsite'
#' @param method.bin method for selecting the most appropriate bins, three options: 'static', 'EMMA' or 'FaST-LMM'
#' @param bin.size bin sizes for all iterations, a vector, the bin size is always from large to small
#' @param bin.selection number of selected bins in each iteration, a vector
#' @param memo a marker on output file name
#' @param Prior prior information, four columns, which are SNP_ID, Chr, Pos, P-value
#' @param ncpus number of threads used for parallele computation
#' @param maxLoop maximum number of iterations
#' @param threshold.output only the GWAS results with p-values lower than threshold.output will be output
#' @param converge a number, 0 to 1, if selected pseudo QTNs in the last and the second last iterations have a certain probality (the probability is converge) of overlap, the loop will stop
#' @param iteration.output whether to output results of all iterations
#' @param p.threshold if all p values generated in the first iteration are bigger than p.threshold, FarmCPU stops
#' @param QTN.threshold in second and later iterations, only SNPs with lower p-values than QTN.threshold have chances to be selected as pseudo QTNs
#' @param bound maximum number of SNPs selected as pseudo QTNs in each iteration
#' @param verbose whether to print detail.
#'
#' @return a m by 4 results matrix, m is marker size, the four columns are SNP_ID, Chr, Pos, and p-value
#' @export
#'
#' @examples
#' \donttest{
#' phePath <- system.file("extdata", "07_other", "mvp.phe", package = "rMVP")
#' phenotype <- read.table(phePath, header=TRUE)
#' idx <- !is.na(phenotype[, 2])
#' phenotype <- phenotype[idx, ]
#' print(dim(phenotype))
#' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
#' genotype <- attach.big.matrix(genoPath)
#' genotype <- deepcopy(genotype, cols=idx)
#' print(dim(genotype))
#' mapPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.map", package = "rMVP")
#' map <- read.table(mapPath , head = TRUE)
#'
#' farmcpu <- MVP.FarmCPU(phe=phenotype,geno=genotype,map=map,maxLoop=2,method.bin="static")
#' str(farmcpu)
#' }
#'
`MVP.FarmCPU` <- function(phe, geno, map, CV=NULL, P=NULL, method.sub="reward", method.sub.final="reward",
method.bin=c("EMMA", "static", "FaST-LMM"), bin.size=c(5e5,5e6,5e7), bin.selection=seq(10,100,10),
memo="MVP.FarmCPU", Prior=NULL, ncpus=2, maxLoop=10,
threshold.output=.01, converge=1, iteration.output=FALSE, p.threshold=NA,
QTN.threshold=0.01, bound=NULL, verbose=TRUE){
#print("--------------------- Welcome to FarmCPU ----------------------------")
if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.")
if(sum(is.na(phe[, 2])) != 0) stop("NAs are not allowed in phenotype.")
if(nrow(phe) != ncol(geno)) stop("number of individuals not match in phenotype and genotype.")
echo=TRUE
nm=nrow(map)
if(!is.null(CV)){
CV=as.matrix(CV)
CV.index <- apply(CV, 2, function(x) length(table(x)) > 1)
CV <- CV[, CV.index, drop=FALSE]
npc=ncol(CV)
}else{
npc=0
}
method.bin=match.arg(method.bin)
map <- as.matrix(map)
suppressWarnings(max.chr <- max(as.numeric(map[, 2]), na.rm=TRUE))
if(is.infinite(max.chr)) max.chr <- 0
map.xy.index <- which(!as.numeric(map[, 2]) %in% c(0 : max.chr))
if(length(map.xy.index) != 0){
chr.xy <- unique(map[map.xy.index, 2])
for(i in 1:length(chr.xy)){
map[map[, 2] == chr.xy[i], 2] <- max.chr + i
}
}
map[, 1] = 1:nrow(map)
suppressWarnings(map <- matrix(as.numeric(map), nrow(map)))
if(sum(is.na(map[,3]) != 0)) stop("Non-digital characters or NAs are not allowed in map for FarmCPU")
if(!is.na(p.threshold)) QTN.threshold = max(p.threshold, QTN.threshold)
name.of.trait=colnames(phe)[2]
if(!is.null(memo)) name.of.trait=paste(memo,".",name.of.trait,sep="")
theLoop=0
theConverge=0
seqQTN.save=c(0)
seqQTN.pre=c(-1)
isDone=FALSE
name.of.trait2=name.of.trait
while(!isDone) {
theLoop=theLoop+1
logging.log(paste("Current loop: ",theLoop," out of maximum of ", maxLoop, sep=""), "\n", verbose = verbose)
spacer="0"
if(theLoop>9){
spacer=""
}
if(iteration.output){
name.of.trait2=paste("Iteration_",spacer,theLoop,".",name.of.trait,sep="")
}
#Step 2a: Set prior
myPrior=FarmCPU.Prior(GM=map,P=P,Prior=Prior)
#Step 2b: Set bins
if(theLoop<=2){
myBin=FarmCPU.BIN(Y=phe[,c(1,2)],GDP=geno,GM=map,CV=CV,P=myPrior,method=method.bin,b=bin.size,s=bin.selection,theLoop=theLoop,bound=bound,ncpus=ncpus, verbose = verbose)
}else{
myBin=FarmCPU.BIN(Y=phe[,c(1,2)],GDP=geno,GM=map,CV=theCV,P=myPrior,method=method.bin,b=bin.size,s=bin.selection,theLoop=theLoop,ncpus=ncpus, verbose = verbose)
}
#Step 2c: Remove bin dependency
#Remove QTNs in LD
seqQTN=myBin$seqQTN
if(theLoop==2){
if(!is.na(p.threshold)){
if(min(myPrior,na.rm=TRUE)>p.threshold){
seqQTN=NULL
logging.log("Top snps have little effect, set seqQTN to NULL!", "\n", verbose = verbose)
}
}else{
if(min(myPrior,na.rm=TRUE)>0.01/nm){
seqQTN=NULL
logging.log("Top snps have little effect, set seqQTN to NULL!", "\n", verbose = verbose)
}
}
}
#when FarmCPU can not work, make a new QQ plot and manhatthan plot
if(theLoop==2&&is.null(seqQTN)){
#Report
P=myGLM$P[,ncol(myGLM$P)]
P[P==0] <- min(P[P!=0],na.rm=TRUE)*0.01
results = cbind(myGLM$B, myGLM$S, P)
colnames(results) = c("effect", "se", "p")
break
}#force to exit for GLM model while seqQTN=NULL and h2=0
if (!any(is.null(seqQTN.save)) && theLoop > 1) {
if (!(0 %in% seqQTN.save || -1 %in% seqQTN.save) && !is.null(seqQTN)) {
#Force previous QTNs in the model
seqQTN <- union(seqQTN,seqQTN.save)
}
}
if(theLoop!=1){
seqQTN.p=myPrior[seqQTN]
if(theLoop==2){
index.p=seqQTN.p<QTN.threshold
#if(!is.na(p.threshold)){
#index.p=seqQTN.p<p.threshold
#}
seqQTN.p=seqQTN.p[index.p]
seqQTN=seqQTN[index.p]
seqQTN.p=seqQTN.p[!is.na(seqQTN)]
seqQTN=seqQTN[!is.na(seqQTN)]
}else{
index.p=seqQTN.p<QTN.threshold
#if(!is.na(p.threshold)){
#index.p=seqQTN.p<p.threshold
#}
index.p[seqQTN%in%seqQTN.save]=TRUE
seqQTN.p=seqQTN.p[index.p]
seqQTN=seqQTN[index.p]
seqQTN.p=seqQTN.p[!is.na(seqQTN)]
seqQTN=seqQTN[!is.na(seqQTN)]
}
}
myRemove=FarmCPU.Remove(GDP=geno,GM=map,seqQTN=seqQTN,seqQTN.p=seqQTN.p,threshold=.7)
#Recoding QTNs history
seqQTN=myRemove$seqQTN
theConverge=length(intersect(seqQTN,seqQTN.save))/length(union(seqQTN,seqQTN.save))
circle=(length(union(seqQTN,seqQTN.pre))==length(intersect(seqQTN,seqQTN.pre)))
#handler of initial status
if(is.null(seqQTN.pre)){circle=FALSE
}else{
if(seqQTN.pre[1]==0) circle=FALSE
if(seqQTN.pre[1]==-1) circle=FALSE
}
logging.log("seqQTN:", "\n", verbose = verbose)
if(is.null(seqQTN)){
logging.log("NULL", "\n", verbose = verbose)
}else{
logging.log(seqQTN, "\n", verbose = verbose)
}
if(theLoop==maxLoop){
logging.log(paste("Total number of possible QTNs in the model is: ", length(seqQTN),sep=""), "\n", verbose = verbose)
}
isDone=((theLoop>=maxLoop) | (theConverge>=converge) | circle )
seqQTN.pre=seqQTN.save
seqQTN.save=seqQTN
#Step 3: Screen with bins
rm(myBin)
gc()
theCV=CV
if(!is.null(myRemove$bin)){
if(length(myRemove$seqQTN) == 1){
#myRemove$bin = as.matrix(myRemove$bin)
myRemove$bin = t(myRemove$bin)
}
theCV=cbind(CV,myRemove$bin)
}
myGLM=FarmCPU.LM(y=phe[,2],GDP=geno,w=theCV,ncpus=ncpus,npc=npc, verbose=verbose)
if(!is.null(seqQTN)){
if(ncol(myGLM$P) != (npc + length(seqQTN) + 1)) stop("wrong dimensions.")
}
#Step 4: Background unit substitution
if(!isDone){
myGLM=FarmCPU.SUB(GM=map,GLM=myGLM,QTN=map[myRemove$seqQTN, , drop=FALSE],method=method.sub)
}else{
myGLM=FarmCPU.SUB(GM=map,GLM=myGLM,QTN=map[myRemove$seqQTN, , drop=FALSE],method=method.sub.final)
}
P=myGLM$P[,ncol(myGLM$P)]
P[P==0] <- min(P[P!=0], na.rm=TRUE)*0.01
results = cbind(myGLM$B, myGLM$S, P)
colnames(results) = c("effect", "se", "p")
} #end of while loop
#print("****************FarmCPU ACCOMPLISHED****************")
return(results)
}#The MVP.FarmCPU function ends here
#' FarmCPU.FaSTLMM.LL
#' Evaluation of the maximum likelihood using FaST-LMM method
#'
#' Last update: January 11, 2017
#' Requirement: pheno, snp.pool, and X0 must have same taxa order.
#' Requirement: No missing data
#'
#' @author Xiaolei Liu (modified)
#'
#' @param pheno a two-column phenotype matrix
#' @param snp.pool matrix for pseudo QTNs
#' @param X0 covariates matrix
#' @param ncpus number of threads used for parallel computation
#'
#' @return
#' Output: beta - beta effect
#' Output: delta - delta value
#' Output: LL - log-likelihood
#' Output: vg - genetic variance
#' Output: ve - residual variance
#'
`FarmCPU.FaSTLMM.LL` <- function(pheno, snp.pool, X0=NULL, ncpus=2){
y=pheno
p=0
deltaExpStart = -5
deltaExpEnd = 5
snp.pool=snp.pool[,]
if(!is.null(snp.pool)&&any(apply(snp.pool, 2, var)==0)){
deltaExpStart = 100
deltaExpEnd = deltaExpStart
}
if(is.null(X0)) {
X0 = matrix(1, nrow(snp.pool), 1)
}
X=X0
#########SVD of X
K.X.svd <- svd(snp.pool)
d=K.X.svd$d
d=d[d>1e-08]
d=d^2
U1=K.X.svd$u
U1=U1[,1:length(d)]
#handler of single snp
if(is.null(dim(U1))) U1=matrix(U1,ncol=1)
n=nrow(U1)
U1TX=crossprod(U1,X)
U1TY=crossprod(U1,y)
yU1TY <- y-U1%*%U1TY
XU1TX<- X-U1%*%U1TX
IU = -tcrossprod(U1)
diag(IU) = rep(1,n) + diag(IU)
IUX=crossprod(IU,X)
IUY=crossprod(IU,y)
#Iteration on the range of delta (-5 to 5 in glog scale)
delta.range <- seq(deltaExpStart,deltaExpEnd,by=0.1)
m <- length(delta.range)
#for (m in seq(deltaExpStart,deltaExpEnd,by=0.1)){
beta.optimize.parallel <- function(ii){
#p=p+1
delta <- exp(delta.range[ii])
#----------------------------calculate beta-------------------------------------
#######get beta1
beta1=0
for(i in 1:length(d)){
one=matrix(U1TX[i,], nrow=1)
beta=crossprod(one,(one/(d[i]+delta))) #This is not real beta, confusing
beta1= beta1+beta
}
#######get beta2
beta2=0
for(i in 1:nrow(U1)){
one=matrix(IUX[i,], nrow=1)
beta = crossprod(one)
beta2= beta2+beta
}
beta2<-beta2/delta
#######get beta3
beta3=0
for(i in 1:length(d)){
one1=matrix(U1TX[i,], nrow=1)
one2=matrix(U1TY[i,], nrow=1)
beta=crossprod(one1,(one2/(d[i]+delta)))
beta3= beta3+beta
}
###########get beta4
beta4=0
for(i in 1:nrow(U1)){
one1=matrix(IUX[i,], nrow=1)
one2=matrix(IUY[i,], nrow=1)
beta=crossprod(one1,one2)
beta4= beta4+beta
}
beta4<-beta4/delta
#######get final beta
zw1 <- ginv(beta1+beta2)
#zw1 <- try(solve(beta1+beta2))
#if(inherits(zw1, "try-error")){
#zw1 <- ginv(beta1+beta2)
#}
zw2=(beta3+beta4)
beta=crossprod(zw1,zw2)
#----------------------------calculate LL---------------------------------------
####part 1
part11<-n*log(2*3.14)
part12<-0
for(i in 1:length(d)){
part12_pre=log(d[i]+delta)
part12= part12+part12_pre
}
part13<- (nrow(U1)-length(d))*log(delta)
part1<- -1/2*(part11+part12+part13)
###### part2
part21<-nrow(U1)
######part221
part221=0
for(i in 1:length(d)){
one1=U1TX[i,]
one2=U1TY[i,]
part221_pre=(one2-one1%*%beta)^2/(d[i]+delta)
part221 = part221+part221_pre
}
part222=0
for(i in 1:n){
one1=XU1TX[i,]
one2=yU1TY[i,]
part222_pre=((one2-one1%*%beta)^2)/delta
part222= part222+part222_pre
}
part22<-n*log((1/n)*(part221+part222))
part2<- -1/2*(part21+part22)
################# likihood
LL<-part1+part2
part1<-0
part2<-0
return(list(beta=beta,delta=delta,LL=LL))
}
llresults <- lapply(1:m, beta.optimize.parallel)
for(i in 1:m){
if(i == 1){
beta.save = llresults[[i]]$beta
delta.save = llresults[[i]]$delta
LL.save = llresults[[i]]$LL
}else{
if(llresults[[i]]$LL > LL.save){
beta.save = llresults[[i]]$beta
delta.save = llresults[[i]]$delta
LL.save = llresults[[i]]$LL
}
}
}
#--------------------update with the optimum------------------------------------
beta=beta.save
delta=delta.save
LL=LL.save
#--------------------calculating Va and Vem-------------------------------------
#sigma_a1
sigma_a1=0
for(i in 1:length(d)){
one1=matrix(U1TX[i,], ncol=1)
one2=matrix(U1TY[i,], nrow=1)
#sigma_a1_pre=(one2-one1%*%beta)^2/(d[i]+delta)
sigma_a1_pre=(one2-crossprod(one1,beta))^2/(d[i]+delta)
sigma_a1= sigma_a1+sigma_a1_pre
}
### sigma_a2
sigma_a2=0
for(i in 1:nrow(U1)){
one1=matrix(IUX[i,], ncol=1)
one2=matrix(IUY[i,], nrow=1)
#sigma_a2_pre<-(one2-one1%*%beta)^2
sigma_a2_pre<-(one2-crossprod(one1,beta))^2
sigma_a2= sigma_a2+sigma_a2_pre
}
sigma_a2<-sigma_a2/delta
sigma_a<- 1/n*(sigma_a1+sigma_a2)
sigma_e<-delta*sigma_a
return(list(beta=beta, delta=delta, LL=LL, vg=sigma_a, ve=sigma_e))
}
#' FarmCPU.BIN
#'
#' Last update: March 28, 2017
#' Requirement: Y, GDP, and CV must have same taxa order. GDP and GM must have the same order on SNP
#' Requirement: P and GM are in the same order
#' Requirement: No missing data
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param Y a n by 2 matrix, the fist column is taxa id and the second is trait
#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA
#' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param CV covariates, n by c matrix, n is sample size, c is number of covariates
#' @param P start p values for all SNPs
#' @param method two options, 'static' or 'FaST-LMM'
#' @param b bin sizes for all iterations, a vector, the bin size is always from large to small
#' @param s number of selected bins in each iteration, a vector
#' @param theLoop iteration number
#' @param bound maximum number of SNPs selected as pseudo QTNs in each iteration
#' @param ncpus number of threads used for parallele computation
#' @param verbose whether to print detail.
#'
#' @return
#' Output: seqQTN - an s by 1 vecter for index of QTNs on GM file
#'
#' @keywords internal
FarmCPU.BIN <-
function(Y=NULL, GDP=NULL, GM=NULL, CV=NULL, P=NULL, method="EMMA", b=c(5e5,5e6,5e7), s=seq(10,100,10), theLoop=NULL, bound=NULL, ncpus=2, verbose=TRUE){
#print("FarmCPU.BIN Started")
if(is.null(P)) return(list(bin=NULL,binmap=NULL,seqQTN=NULL))
#Set upper bound for bin selection to squareroot of sample size
n=nrow(Y)
#bound=round(sqrt(n)/log10(n))
if(is.null(bound)){
bound=round(sqrt(n)/sqrt(log10(n)))
}
s[s>bound]=bound
s=unique(s[s<=bound]) #keep the within bound
optimumable=(length(b)*length(s)>1)
if(!optimumable & method!="optimum"){
method="static"
}
if(optimumable){
s[s>bound]=bound
#print("optimizing possible QTNs...")
#GP=cbind(GM,P,NA,NA,NA)
#mySpecify=FarmCPU.Specify(GI=GM, GP=GP, bin.size=b, inclosure.size=s)
#seqQTN=which(mySpecify$index==TRUE)
}
#Method of static
if(method=="static"&optimumable){
#print("Via static")
if(theLoop==2){
b=b[3]
}else if(theLoop==3){
b=b[2]
}else{
b=b[1]
}
s=bound
s[s>bound]=bound
logging.log("Optimizing Pseudo QTNs...", "\n", verbose = verbose)
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=b,inclosure.size=s)
seqQTN.save=which(mySpecify$index==TRUE)
}
#Method of optimum: FaST-LMM
#============================Optimize by FaST-LMM============================================
if(method=="FaST-LMM"&optimumable){
#print("c(bin.size, bin.selection, -2LL, VG, VE)")
logging.log("Optimizing Pseudo QTNs...", "\n", verbose = verbose)
count=0
for (bin in b){
for (inc in s){
count=count+1
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc)
seqQTN=which(mySpecify$index==TRUE)
GK=t(GDP[seqQTN,])
myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method)
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
myVE=myBurger$ve #it is unused
logging.log(c(bin,inc,myREML,myVG,myVE), "\n", verbose = verbose)
#Recoding the optimum GK
if(count==1){
seqQTN.save=seqQTN
LL.save=myREML
bin.save=bin
inc.save=inc
vg.save=myVG # for genetic variance
ve.save=myVE # for residual variance
}else{
if(myREML<LL.save){
seqQTN.save=seqQTN
LL.save=myREML
bin.save=bin
inc.save=inc
vg.save=myVG # for genetic variance
ve.save=myVE # for residual variance
}
} #end of if(count==1)
}#loop on bin number
}#loop on bin size
#seqQTN=seqQTN.save
}
#Method of optimum: EMMA
#============================Optimize by EMMA============================================
if(method=="EMMA"&optimumable){
#print("c(bin.size, bin.selection, -2LL, VG, VE)")
logging.log("Optimizing Pseudo QTNs...", "\n", verbose = verbose)
m <- length(b)*length(s)
inc.index = rep(c(1:length(s)), length(b))
seqQTN.optimize.parallel <- function(ii){
bin.index = floor((ii-0.1)/length(s)) + 1
bin = b[bin.index]
inc = s[inc.index[ii]]
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc)
seqQTN=which(mySpecify$index==TRUE)
GK=t(GDP[seqQTN,])
myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method)
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
myVE=myBurger$ve #it is unused
logging.log(c(bin,inc,myREML,myVG,myVE), "\n", verbose = verbose)
return(list(seqQTN=seqQTN,myREML=myREML))
}
llresults <- lapply(1:m, seqQTN.optimize.parallel)
for(i in 1:m){
if(i == 1){
seqQTN.save = llresults[[i]]$seqQTN
myREML.save = llresults[[i]]$myREML
}else{
if(llresults[[i]]$myREML < myREML.save){
seqQTN.save = llresults[[i]]$seqQTN
myREML.save = llresults[[i]]$myREML
}
}
}
}
#Method of optimum: GEMMA
#can not be used to provide REML
#============================Optimize by GEMMA============================================
if(method=="GEMMA"&optimumable){
#print("c(bin.size, bin.selection, -2LL, VG, VE)")
logging.log("Optimizing Pseudo QTNs...\n", verbose = verbose)
m <- length(b)*length(s)
seqQTN.optimize.parallel <- function(ii){
bin = floor((ii-0.1)/length(s)) + 1
inc = rep(c(1:length(s)), length(b))
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin[ii],inclosure.size=inc[ii])
seqQTN=which(mySpecify$index==TRUE)
GK=t(GDP[seqQTN,])
myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method)
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
myVE=myBurger$ve #it is unused
logging.log(c(bin,inc,myREML,myVG,myVE), "\n", verbose = verbose)
return(list(seqQTN=seqQTN,myREML=myREML))
}
llresults <- lapply(1:m, seqQTN.optimize.parallel)
for(i in 1:m){
if(i == 1){
seqQTN.save = llresults[[i]]$seqQTN
myREML.save = llresults[[i]]$myREML
}else{
if(llresults[[i]]$myREML < myREML.save){
seqQTN.save = llresults[[i]]$seqQTN
myREML.save = llresults[[i]]$myREML
}
}
}
}
return(list(seqQTN=seqQTN.save))
}#The function FarmCPU.BIN ends here
#' To get indicator (TURE or FALSE) for GI based on GP
#'
#' Last update: January 26, 2017
#' Strategy
#' 1.set bins for all snps in GP
#' 2.keep the snp with smallest P value in each bin, record SNP ID
#' 3.Search GI for SNP with SNP ID from above
#' 4.return the position for SNP selected
#'
#' @author Zhiwu Zhang
#'
#' @param GI Data frame with three columns (SNP name, chr and base position)
#' @param GP Data frame with seven columns (SNP name, chr and base position, P, MAF, N, effect)
#' @param bin.size a value of @param b in 'FarmCPU.bin' function
#' @param inclosure.size a value of @param s in 'FarmCPU.bin' function
#' @param MaxBP maximum base pairs for each chromosome
#'
#' @return theIndex: a vector indicating if the SNPs in GI belong to QTN or not
FarmCPU.Specify <-
function(GI=NULL, GP=NULL, bin.size=10000000, inclosure.size=NULL, MaxBP=1e10){
#print("Specification in process...")
if(is.null(GP))return (list(index=NULL,BP=NULL))
#set inclosure bin in GP
#Create SNP ID: position+CHR*MaxBP
ID.GP=as.numeric(as.vector(GP[, 3]))+as.numeric(as.vector(GP[, 2]))*MaxBP
#Creat bin ID
bin.GP=floor(ID.GP/bin.size )
#Create a table with bin ID, SNP ID and p value (set 2nd and 3rd NA temporately)
binP=as.matrix(cbind(bin.GP,NA,NA,ID.GP,as.numeric(as.vector(GP[,4]))) )
n=nrow(binP)
#Sort the table by p value and then bin ID (e.g. sort p within bin ID)
binP=binP[order(as.numeric(as.vector(binP[,5]))),] #sort on P alue
binP=binP[order(as.numeric(as.vector(binP[,1]))),] #sort on bin
#set indicator (use 2nd 3rd columns)
binP[2:n,2]=binP[1:(n-1),1]
binP[1,2]=0 #set the first
binP[,3]= binP[, 1]-binP[, 2]
#Se representives of bins
ID.GP=binP[binP[, 3]>0,]
#Choose the most influencial bins as estimated QTNs
#Handler of single row
if(is.null(dim(ID.GP))) ID.GP=matrix(ID.GP,1,length(ID.GP))
ID.GP=ID.GP[order(as.numeric(as.vector(ID.GP[, 5]))),] #sort on P alue
#Handler of single row (again after reshape)
if(is.null(dim(ID.GP))) ID.GP=matrix(ID.GP,1,length(ID.GP))
index=!is.na(ID.GP[, 4])
ID.GP=ID.GP[index,4] #must have chr and bp information, keep SNP ID only
if(!is.null(inclosure.size) ) {
if(!any(is.na(inclosure.size))){
avaiable=min(inclosure.size,length(ID.GP))
if(avaiable==0){
ID.GP=-1
}else{
ID.GP=ID.GP[1:avaiable] #keep the top ones selected
}
}
}
#create index in GI
theIndex=NULL
if(!is.null(GI)){
ID.GI=as.numeric(as.vector(GI[, 3]))+as.numeric(as.vector(GI[, 2]))*MaxBP
theIndex=ID.GI %in% ID.GP
}
myList=list(index=theIndex,CB=ID.GP)
return (list(index=theIndex,CB=ID.GP))
} #end of FarmCPU.Specify
#' To quickly sovel LM with one variable substitute multiple times
#'
#' Start date: March 1, 2013
#' Last update: March 6, 2013
#' Strategy:
#' 1. Separate constant covariates (w) and dynamic coveriates (x)
#' 2. Build non-x related only once
#' 3. Use apply to iterate x
#' 4. Derive dominance indicate d from additive indicate (x) mathmaticaly
#' 5. When d is not estimable, continue to test x
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param y one column matrix, dependent variable
#' @param w covariates, n by c matrix, n is sample size, c is number of covariates
#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.
#' @param ncpus number of threads used for parallele computation
#' @param npc number of covariates without pseudo QTNs
#' @param verbose whether to print detail.
#'
#' @return
#' Output: P - p-value of each SNP
#' Output: betapred - effects of pseudo QTNs
#' Output: B - effect of each SNP
#'
#' @keywords internal
FarmCPU.LM <-
function(y, w=NULL, GDP, ncpus=2, npc=0, verbose=TRUE){
#print("FarmCPU.LM started")
if(is.null(y)) return(NULL)
if(is.null(GDP)) return(NULL)
#Constant section (non individual marker specific)
#---------------------------------------------------------
#Configration
nd=20 #number of markes for checking A and D dependency
N=length(y) #Total number of taxa, including missing ones
if(!is.null(w)){
w=as.matrix(w,nrow = N)
nf=ncol(w)
w=cbind(rep(1,N),w)
q0=ncol(w)
}else{
w=rep(1,N)
nf=0
q0=1
}
w = as.matrix(w)
logging.log("number of covariates in current loop is:", "\n", verbose = verbose)
logging.log(nf, "\n", verbose = verbose)
n=N
if(nd>n)nd=n #handler of samples less than nd
k=1 #number of genetic effect: 1 and 2 for A and AD respectively
q1=(q0+1) # vecter index for the posistion of genetic effect (a)
q2=(q0+1):(q0+2) # vecter index for the posistion of genetic effect (a and d)
df=n-q0-k #residual df (this should be varied based on validating d)
iXX=matrix(0,q0+k,q0+k) #Reserve the maximum size of inverse of LHS
# ww=crossprodcpp(w)
ww = crossprod(w)
wy = crossprod(w,y)
# yy=crossprodcpp(y)
yy = crossprod(y)
wwi = ginv(ww)
#Statistics on the reduced model without marker
rhs=wy
beta <- crossprod(wwi,rhs)
ve <- (yy - crossprod(beta, rhs)) / df
se <- sqrt(diag(wwi) * ve[1])
if(npc!=0){
betapc = beta[2:(npc+1)]
betapred = beta[-c(1:(npc+1))]
sepred = se[-c(1:(npc+1))]
}else{
betapc = NULL
betapred = beta[-1]
sepred = se[-1]
}
#print(ncpus)
logging.log("scanning...", "\n", verbose = verbose)
# P <- NULL
# B <- NULL
# S <- NULL
# setMKLthreads(10)
# for(i in 1:nrow(GDP)){
# xx <- summary(lm(y~cbind(w,GDP[i, ])))$coeff
# P <- rbind(P, xx[-1,4,drop=TRUE])
# B <- c(B, xx[nrow(xx), 1])
# S <- c(S, xx[nrow(xx), 2])
# }
# if(ncol(w) == 50) write.csv(P, "P.csv")
mkl_env({
results <- glm_c(y=y, X=w, iXX = wwi, GDP@address, verbose=verbose, threads=ncpus)
})
return(list(P=results[ ,-c(1:3), drop=FALSE], betapred=betapred, sepred=sepred, B=results[ , 1, drop=FALSE], S=results[ , 2, drop=FALSE]))
# return(list(P=P, betapred=betapred, B=as.matrix(B), S=S))
} #end of FarmCPU.LM function
#' FarmCPU.Burger
#'
#' Last update: Dec 21, 2016
#' To calculate likelihood, variances and ratio, revised by Xiaolei based on GAPIT.Burger function from GAPIT package
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param Y phenotype, n by t matrix, n is sample size, t is number of phenotypes
#' @param CV covariates, n by c matrix, n is sample size, c is number of covariates
#' @param GK Genotype data in numerical format, taxa goes to row and snp go to columns
#' @param ncpus number of threads used for parallele computation
#' @param method two options for estimating variance components, 'FaST-LMM' or 'EMMA'
#'
#' @return
#' Output: REMLs - maximum log likelihood
#' Output: vg - genetic variance
#' Output: ve - residual variance
#' Output: delta - exp(root)
#'
#' @keywords internal
FarmCPU.Burger <-
function(Y=NULL,CV=NULL,GK=NULL,ncpus=2, method="FaST-LMM"){
if(!is.null(CV)){
CV=as.matrix(CV)#change CV to a matrix when it is a vector
theCV=as.matrix(cbind(matrix(1,nrow(CV),1),CV))
}else{
theCV=matrix(1,nrow(Y),1)
}
#handler of single column GK
n=nrow(GK)
m=ncol(GK)
if(!is.null(GK)){
n=nrow(GK)
m=ncol(GK)
if(m > 2){
theGK = as.matrix(GK)#GK is pure genotype matrix
}else if(m==0){
theGK = NULL
}else{
theGK = matrix(GK,n,1)
}
}else{
theGK=GK
}
if(method=="FaST-LMM"){
myFaSTREML=FarmCPU.FaSTLMM.LL(pheno=matrix(Y[,-1],nrow(Y),1), snp.pool=theGK, X0=theCV, ncpus=ncpus)
REMLs=-2*myFaSTREML$LL
delta=myFaSTREML$delta
vg=myFaSTREML$vg
ve=myFaSTREML$ve
}
if(method=="EMMA"){
K <- MVP.K.VanRaden(M=as.big.matrix(t(theGK)), priority="speed",verbose=FALSE)
myEMMAREML <- MVP.EMMA.Vg.Ve(y=matrix(Y[,-1],nrow(Y),1), X=theCV, K=K)
REMLs=-2*myEMMAREML$REML
delta=myEMMAREML$delta
vg=myEMMAREML$vg
ve=myEMMAREML$ve
}
#print("FarmCPU.Burger succeed!")
return (list(REMLs=REMLs, vg=vg, ve=ve, delta=delta))
} #end of FarmCPU.Burger
#' FarmCPU.SUB
#'
#' Last update: Febuary 26, 2013
#' Requirement: P has row name of SNP. s<=t. covariates of QTNs are next to SNP
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param GLM FarmCPU.GLM object
#' @param QTN s by 3 matrix for SNP name, chromosome and BP
#' @param method options are "penalty", "reward","mean","median",and "onsite"
#'
#' @return
#' Output: GLM$P - Updated p-values by substitution process
#' Output: GLM$B - Updated effects by substitution process
FarmCPU.SUB <-
function(GM=NULL,GLM=NULL,QTN=NULL,method="mean"){
if(is.null(GLM$P)) return(NULL) #P is required
if(is.null(QTN)) return(NULL) #QTN is required
#print("FarmCPU.SUB Started")
#print(length(QTN))
#if(length(QTN)==3){
#QTN=QTN[1]
#}else{
QTN=QTN[ , 1, drop=FALSE]
#}
position=match(QTN, GM[, 1, drop=FALSE], nomatch = 0)
nqtn=length(position)
if(is.numeric(GLM$P)){
GLM$P = as.matrix(GLM$P)
}
GLM$B = as.matrix(GLM$B)
index=(ncol(GLM$P)-nqtn):(ncol(GLM$P)-1)
spot=ncol(GLM$P)
if(ncol(GLM$P)!=1){
if(length(index)>1){
if(method=="penalty") P.QTN=apply(GLM$P[,index],2,max,na.rm=TRUE)
if(method=="reward") P.QTN=apply(GLM$P[,index],2,min,na.rm=TRUE)
if(method=="mean") P.QTN=apply(GLM$P[,index],2,mean,na.rm=TRUE)
if(method=="median") P.QTN=apply(GLM$P[,index],2,median,na.rm=TRUE)
if(method=="onsite") P.QTN=GLM$P0[(length(GLM$P0)-nqtn+1):length(GLM$P0)]
}else{
if(method=="penalty") P.QTN=max(GLM$P[,index, drop=FALSE],na.rm=TRUE)
if(method=="reward") P.QTN=min(GLM$P[,index, drop=FALSE],na.rm=TRUE)
if(method=="mean") P.QTN=mean(GLM$P[,index, drop=FALSE],na.rm=TRUE)
if(method=="median") P.QTN=median(GLM$P[,index, drop=FALSE],median,na.rm=TRUE)
if(method=="onsite") P.QTN=GLM$P0[(length(GLM$P0)-nqtn+1):length(GLM$P0)]
}
#replace SNP pvalues with QTN pvalue
GLM$P[position, spot] = P.QTN
GLM$B[position, ] = GLM$betapred
GLM$S[position, ] = GLM$sepred
}
return(GLM)
}#The function FarmCPU.SUB ends here
#' Remove bins that are highly correlated
#'
#' Last update: March 4, 2013
#' Requirement: GDP and GM have the same order on SNP
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA
#' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param seqQTN s by 1 vecter for index of QTN on GM
#' @param seqQTN.p p value of each @param seqQTN
#' @param threshold pearson correlation threshold for remove correlated markers
#'
#' @return
#' Output: bin - n by s0 matrix of genotype
#' Output: binmap - s0 by 3 matrix for map of bin
#' Output: seqQTN - s0 by 1 vecter for index of QTN on GM
#' Relationship: bin=GDP[,c(seqQTN)], binmap=GM[seqQTN,], s0<=s
#' @keywords internal
FarmCPU.Remove <-
function(GDP=NULL, GM=NULL, seqQTN=NULL, seqQTN.p=NULL, threshold=.99){
if(is.null(seqQTN))return(list(bin=NULL,binmap=NULL,seqQTN=NULL))
seqQTN=seqQTN[order(seqQTN.p)]
hugeNum=10e10
n=length(seqQTN)
#fielter bins by physical location
binmap=GM[seqQTN, , drop=FALSE]
cb=as.numeric(binmap[, 2, drop=FALSE])*hugeNum+as.numeric(binmap[, 3, drop=FALSE])#create ID for chromosome and bp
cb.unique=unique(cb)
#print("debuge")
#print(cb)
#print(cb.unique)
index=match(cb.unique,cb,nomatch = 0)
seqQTN=seqQTN[index]
#print("Number of bins after chr and bp fillter")
n=length(seqQTN) #update n
#Set sample
ratio=.1
maxNum=100000
m=nrow(GDP) #sample size
s=ncol(GDP) #number of markers
sampled=s
if(sampled>maxNum)sampled=maxNum
#index=sample(s,sampled)
index=1:sampled
#This section has problem of turning big.matrix to R matrix
#It is OK as x is small
if(is.big.matrix(GDP)){
x=t(as.matrix(deepcopy(GDP,rows=seqQTN,cols=index) ))
}else{
x=t(GDP[seqQTN,index] )
}
r=cor(as.matrix(x))
index=abs(r)>threshold
b=r*0
b[index]=1
c=1-b
#The above are replaced by following
c[lower.tri(c)]=1
diag(c)=1
bd <- apply(c,2,prod)
position=(bd==1)
seqQTN=seqQTN[position]
#============================end of optimum============================================
seqQTN=seqQTN[!is.na(seqQTN)]
#This section has problem of turning big.matrix to R matrix
if(is.big.matrix(GDP)){
bin=t(as.matrix(deepcopy(GDP,rows=seqQTN,) ))
}else{
bin=t(GDP[seqQTN, , drop=FALSE] )
}
binmap=GM[seqQTN, , drop=FALSE]
return(list(bin=bin, binmap=binmap, seqQTN=seqQTN))
}#The function FarmCPU.Remove ends here
#' Set prior on existing p value
#'
#' Last update: March 10, 2013
#' Requirement: P and GM are in the same order, Prior is part of GM except P value
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param GM an m by 3 matrix for SNP name, chromosome and BP
#' @param P an m by 1 matrix containing probability
#' @param Prior an s by 4 matrix for SNP name, chromosome, BP and p-value
#'
#' @return
#' Output: P - updated P value by prior information
#'
#' @keywords internal
FarmCPU.Prior <-
function(GM, P=NULL, Prior=NULL){
#print("FarmCPU.Prior Started")
if(is.null(Prior)& is.null(P))return(P)
#get prior position
if(!is.null(Prior)) index=match(Prior[, 1, drop=FALSE],GM[, 1, drop=FALSE],nomatch = 0)
#if(is.null(P)) P=runif(nrow(GM)) #set random p value if not provided (This is not helpful)
#print("debug set prior a")
#Get product with prior if provided
if(!is.null(Prior) & !is.null(P) )P[index]=P[index]*Prior[, 4, drop=FALSE]
return(P)
}#The function FarmCPU.Prior ends here
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.