#' @importFrom grDevices cm.colors colorRampPalette dev.list
#' @importFrom grDevices dev.new dev.off heat.colors jpeg palette
#' @importFrom grDevices pdf png rainbow tiff
#' @importFrom graphics abline axis barplot box boxplot curve
#' @importFrom graphics dotchart hist layout legend lines mtext
#' @importFrom graphics par pie plot points polygon segments text
#' @importFrom methods is
#' @importFrom stats as.dendrogram cor cutree dist ecdf
#' @importFrom stats formula hatvalues hclust lm lsfit median
#' @importFrom stats na.omit pchisq pf pnorm prcomp pt qbeta
#' @importFrom stats qqplot qt quantile reformulate residuals
#' @importFrom stats rnorm runif uniroot var
#' @importFrom utils head install.packages installed.packages
#' @importFrom utils memory.limit memory.size object.size read.csv
#' @importFrom utils read.delim read.table write.csv write.table
#' @importFrom compiler cmpfun
#' @importFrom ape as.phylo
#' @importFrom lme4 lmer lmerControl
#' @importFrom MASS ginv
#' @importFrom plotly plot_ly add_lines add_trace
#' @importFrom grid grid.edit gPath gpar
#' @importFrom EMMREML emmreml
#' @importFrom scatterplot3d scatterplot3d
#' @importFrom bigmemory deepcopy is.big.matrix
#' @importFrom LDheatmap LDheatmap
`FarmCPU.0000` <-
function(){
#################################################################
#FarmCPU: Fixed and random model Circuitous Probability Unification
#This is an R package to perform GWAS and genome prediction
#Designed by Zhiwu Zhang
#Writen by Xiaolei Liu and Zhiwu Zhang
#Thanks for Aaron Kusmec pointing out the bug in 'FarmCPU.Burger' function
FarmCPU.Version="FarmCPU v1.02, Dec 21, 2016"
return(FarmCPU.Version)
}
`FarmCPU.BIN` <-
function(Y=NULL,GDP=NULL,GM=NULL,CV=NULL,P=NULL,orientation="col",method="random",b=c(5e5,5e6,5e7),s=seq(10,100,10),theLoop=NULL,bound=NULL){
#Input: Y - n by 2 matrix with fist column as taxa name and second as trait
#Input: GDP - n by m+1 matrix. The first colum is taxa name. The rest are m genotype
#Input: GM - m by 3 matrix for SNP name, chromosome and BP
#Input: CV - n by t matrix for t covariate variables.
#Input: P - m by 1 matrix containing probability
#Input: method - options are "static", "optimum", and "integral"
#Input: b - vecter of length>=1 for bin size
#Input: s - vecter of length>=1 for size of complexity (number of QTNs)
#Requirement: Y, GDP and CV have same taxa order. GDP and GM have the same order on SNP
#Requirement: P and GM are in the same order
#Requirement: No missing data
#Output: bin - n by s matrix of genotype
#Output: binmap - s by 3 matrix for map of bin
#Output: seqQTN - s by 1 vecter for index of QTN on GM (+1 for GDP column wise)
#Relationship: bin=GDP[,c(seqQTN)], binmap=GM[seqQTN,]
#Authors: Zhiwu Zhang
# Last update: Febuary 28, 2013
##############################################################################
#print("FarmCPU.BIN Started")
#print("bin size")
#print(b)
#print("bin selection")
#print(s)
#print("method specified:")
#print(method)
if(is.null(P)) return(list(bin=NULL,binmap=NULL,seqQTN=NULL))
#Set upper bound for bin selection to squareroot of sample size
n=nrow(Y)
#bound=round(sqrt(n)/log10(n))
if(is.null(bound)){
bound=round(sqrt(n)/sqrt(log10(n)))
}
#bound=round(sqrt(n))
#bound=round(n/log10(n))
#bound=n-1
s[s>bound]=bound
s=unique(s[s<=bound]) #keep the within bound
#print("number of bins allowed")
#print(s)
optimumable=(length(b)*length(s)>1)
if(!optimumable & method=="optimum"){
#print("Warning: method was changed from optimum to static")
method="static"
}
#print("method actually used:")
#print(method)
#Method of random
#if(method=="random") seqQTN=sample(nrow(GM),s) #this is for test only
#Method of static
if(method=="static"){
#print("Via static")
if(theLoop==2){
b=b[3]
}else if(theLoop==3){
b=b[2]
}else{
b=b[1]
}
s=bound
#b=median(b)
#s=median(s)
s[s>bound]=bound
#print("Bin : bin.size, bin.selection")
#print(c(b,s))
print("optimizing possible QTNs...")
GP=cbind(GM,P,NA,NA,NA)
mySpecify=GAPIT.Specify(GI=GM,GP=GP,bin.size=b,inclosure.size=s)
seqQTN=which(mySpecify$index==TRUE)
#print("Bin set through static")
}
#Method of optimum
#============================optimum start============================================
if(method=="optimum"&optimumable){
#print("optimizing bins")
#print("c(bin.size, bin.selection, -2LL, VG, VE)")
print("optimizing possible QTNs...")
count=0
for (bin in b){
for (inc in s){
count=count+1
GP=cbind(GM,P,NA,NA,NA)
#print("debug in bin 000")
#print("calling Specify")
#print(date())
mySpecify=GAPIT.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc)
#print("calling Specify done")
#print(date())
seqQTN=which(mySpecify$index==TRUE)
#print("seqQTN")
#print(seqQTN)
if(orientation=="col"){
if(is.big.matrix(GDP)){
GK=deepcopy(GDP,cols=seqQTN)
}else{
GK=GDP[,seqQTN] #GK has the first as taxa in FarmCPU.Burger. But not get uesd.
#GK=GDP[,seqQTN]
}
}else{
#if(is.big.matrix(GDP)){
#GK=deepcopy(GDP,rows=seqQTN)
#GK=t(GK)
#}else{
#GK=cbind(Y[,1],t(GDP[c(1,seqQTN),])) #GK has the first as taxa in FarmCPU.Burger. But not get uesd.
#some problem here
GK=t(GDP[seqQTN,])
#}
}
#print("GK")
#print(GK)
#print("calling Burger")
#print(date())
myBurger=FarmCPU.Burger(Y=Y[,1:2],CV=CV,GK=GK)
#print("calling Burger done")
#print(date())
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
myVE=myBurger$ve #it is unused
#print("c(bin.size, bin.selection, -2LL, VG, VE)")
print(c(bin,inc,myREML,myVG,myVE))
#Recoding the optimum GK
if(count==1){
seqQTN.save=seqQTN
LL.save=myREML
bin.save=bin
inc.save=inc
vg.save=myVG # for genetic variance
ve.save=myVE # for residual variance
}else{
if(myREML<LL.save){
seqQTN.save=seqQTN
LL.save=myREML
bin.save=bin
inc.save=inc
vg.save=myVG # for genetic variance
ve.save=myVE # for residual variance
}
} #end of if(count==1)
}#loop on bin number
}#loop on bin size
seqQTN=seqQTN.save
#ve.save=ve.save
#vg.save=vg.save
#print(seqQTN)
#print("Bin optimized: -2LL, bin.size, bin.selection")
#print(c(LL.save,bin.save,inc.save))
#print(LL.save)
#print("bin.save")
#print(bin.save)
#print("inc.save")
#print(inc.save)
}
#============================end of optimum============================================
bin=NULL
binmap=NULL
#The following are commented out as they will be finalized in Remove function
#if(orientation=="col"){
# bin=GDP[,seqQTN]
#}else{
# bin=t(GDP[seqQTN,] )
#}
#binmap=GM[seqQTN,]
#print(length(seqQTN))
#print("FarmCPU.Bin accomplished successfully!")
return(list(bin=bin,binmap=binmap,seqQTN=seqQTN))
}#The function FarmCPU.BIN ends here
`FarmCPU.GLM` <-
function(Y=NULL,GDP=NULL,GM=NULL,CV=NULL,orientation="row",package="FarmCPU.LM",model="A",ncpus=1,seqQTN=NULL,npc=0){
#Object: To perform GWAS with GLM model
#Input: Y - n by 2 matrix with fist column as taxa name and second as trait
#Input: GDP - n by m matrix. This is Genotype Data Pure (GDP). THERE IS NOT COLUMN FOR TAXA.
#Input: GM - m by 3 matrix for SNP name, chromosome and BP
#Input: CV - n by t matrix for t covariate variables.
#Requirement: Y, GDP and CV have same taxa order. GDP and GM have the same order on SNP
#Output: P - m by 4(t+1) matrix containing estimate, tvalue, stderr and pvalue for covariates and SNP
#Authors: Xiaolei Liu and Zhiwu Zhang
# Last update: may 9, 2012
##############################################################################
if(is.null(Y)) return(NULL) #Y is required
if(is.null(GDP) & is.null(CV)) return(NULL) #Need to have either genotype of CV
#print("FarmCPU.GLM Started")
#print("Dimention of Y, GDP, GM, and CV")
#print(dim(Y))
#print(dim(GDP))
#print(dim(GM))
#print(dim(CV))
#print(head(CV))
#print("Solving equation (This may take a while)...")
if(!is.null(CV)){
CV=as.matrix(CV)
nf=ncol(CV)
}else{
nf=0
}
print("number of covariates in current loop is:")
print(nf)
#Build model with SNP as the last variable
#if(package!="FarmCPU.LM"){
if(is.null(CV)) {
myModel="Y [,2]~x"
if(package!="fast.lm"){
ccv=rep(1,nrow(Y))
}
}else{
#CV=as.matrix(CV)
seqCV=1:(ncol(CV))
myModel=paste("Y[,2]~",paste("CV[,",(seqCV),"]",collapse= "+"),"+ x")
#print(head(CV))
#ccv=cbind(rep(1,nrow(Y)),as.matrix(CV[,2:ncol(CV)]))
if(package!="fast.lm"){
ccv=cbind(rep(1,nrow(Y)),as.matrix(CV))
}
#ccv=as.matrix(CV)
#print("ccv")
#print(head(ccv))
}
#}
##print("The model is: ")
##print(myModel)
#===========================by lm=======================================
if(package=="lm"){
#Solve the model with lm
#P <- apply(GDP,2,function(x){
#fmla <- formula(myModel)
#myLM=lm(fmla)
#lms=summary(myLM)
#lmcoef=lms$coefficients
#lmcoefOnly=lmcoef[-1,] #remove intercept
##print(lmcoefOnly)
#(as.numeric(lmcoefOnly))
#In order of estimate, t, se and P
#cbind(lmcoefOnly[,1],lmcoefOnly[,3],lmcoefOnly[,2],lmcoefOnly[,4])
#})
P<-matrix(NA,nr=nrow(GDP),nc=4*(nf+1))
for(i in 1:nrow(GDP)){
x <- GDP[i,]
fmla <- formula(myModel)
myLM=lm(fmla)
lms=summary(myLM)
lmcoef=lms$coefficients
lmcoefOnly=lmcoef[-1,] #remove intercept
##print(lmcoefOnly)
#(as.numeric(lmcoefOnly))
#In order of estimate, t, se and P
#P[i,]=cbind(lmcoefOnly[,1],lmcoefOnly[,3],lmcoefOnly[,2],lmcoefOnly[,4])
#P[i,1]=lmcoefOnly[1]
#P[i,2]=lmcoefOnly[3]
#P[i,3]=lmcoefOnly[2]
#P[i,4]=lmcoefOnly[4]
#print(lmcoefOnly)
P[i,c(1:(nf+1))]=lmcoefOnly[1:(nf+1)]
P[i,c((nf+2):(2*nf+2))]=lmcoefOnly[(2*nf+3):(3*nf+3)]
P[i,c((2*nf+3):(3*nf+3))]=lmcoefOnly[(nf+2):(2*nf+2)]
P[i,c((3*nf+4):(4*nf+4))]=lmcoefOnly[(3*nf+4):(4*nf+4)]
#P[i,c(1:(nf+1))]=lmcoefOnly[1]
#P[i,c((nf+2):(2*nf+2))]=lmcoefOnly[3]
#P[i,c((2*nf+3):(3*nf+3))]=lmcoefOnly[2]
#P[i,c((3*nf+4):(4*nf+4))]=lmcoefOnly[4]
}
#print(head(P))
#convert list to numeric matrix
#P=t(sapply(P, function(row, max_length) c(row, rep(NA, max_length - length(row))), max(sapply(P, length))))
#the following two do not work
#P=as.data.frame(do.call(rbind, P))
#P=t(as.matrix(P))
P0=NULL
pred=NULL
PF=P[,ncol(P)]
myLM=list(P=P,P0=P0,PF=PF,Pred=pred)
} #end of lm if statement
#===========================by fast.lm=======================================
if(package=="fast.lm"){
#fast.lm does not all?ow missing values
missing=is.na(Y[,2]) #index for missing phenotype
Mtotal=ncol(GDP)
Ym=Y[missing,]
Y=Y[!missing,]
ccv=ccv[!missing,]
GDP=GDP[!missing,]
#set index for markers with no variation
varSNP=apply(GDP,2,var)
indexSNP=which(varSNP!=0)
P0 <- apply(GDP[,indexSNP],2,function(x){
x = cbind(ccv,x)
fast.lm=fastLmPure(y=Y[,2],X = x)
tvalue=fast.lm$coefficients[-1]/fast.lm$stderr[-1]
pvalue=2*pt(abs(tvalue),fast.lm$df.residual,lower.tail = FALSE)
cbind(fast.lm$coefficients[-1],tvalue,fast.lm$stderr[-1],pvalue)
})
#convert list to numeric matrix, the last (t+1) columns are p values for SNPs
P0=t(as.matrix(P0))
#Restore in original oder
mtotal=ncol(GDP)
nfix=ncol(P0)
P=matrix(NA,Mtotal,nfix)
rownames(P)=colnames(GDP) #This should be OK
P[indexSNP,]=P0 #restore the order with markers without variation
P0=NULL
pred=NULL
PF=P[,ncol(P)]
myLM=list(P=P,P0=P0,PF=PF,Pred=pred)
}# end of fast.lm if statement
#===========================by FarmCPU.LM=======================================
if(package=="FarmCPU.LM"){
#print("Calling GLM")
#print(date())
#print("Memory used before calling LM")
#print(memory.size())
gc()
theCV=NULL
if(!is.null(CV)) {
theCV=as.matrix(CV)#as.matrix(CV[,-1])#
seqCV=1:(ncol(theCV))
myModel=paste("y~",paste("w[,",(seqCV),"]",collapse= "+"),"+ x")
}
#print("theCV")
#print(head(theCV))
if(ncpus==1)myLM=FarmCPU.LM(y=Y[,2],w=theCV,GDP=GDP,orientation=orientation,model=model,ncpus=ncpus,myModel=myModel,seqQTN=seqQTN,npc=npc)
if(ncpus>1)myLM=FarmCPU.LM.Parallel(y=Y[,2],w=theCV,x=GDP,orientation=orientation,model=model,ncpus=ncpus,npc=npc)
#print("Memory used after calling LM")
#print(memory.size())
gc()
}# end of FarmCPU.lm if statement
#print("FarmCPU.GLM accoplished")
#print(date())
gc()
#return(list(P=myLM$P,P0=myLM$P0,PF=myLM$PF,Pred=myLM$pred))
return(myLM)
}#The function FarmCPU.GLM ends here
`FarmCPU.Inv` <- function(A){
#Object: To invert a 2 by 2 matrix quickly
#intput: A - 2 by 2 matrix
#Output: Inverse
#Authors: Zhiwu Zhang
# Last update: March 6, 2013
##############################################################################################
detA=A[1,1]*A[2,2]-A[1,2]*A[2,1]
temp=A[1,1]
A=-A
A[1,1]=A[2,2]
A[2,2]=T
return(A/detA)
}#The function FarmCPU.Inv ends here
`FarmCPU.LM.Parallel` <-
function(y,w=NULL,x,orientation="col",model="A",ncpus=2){
#Object: 1. To quickly sovel LM with one variable substitute multiple times
#Object: 2. To fit additive and additive+dominace model
#intput: y - dependent variable
#intput: w - independent variable
#intput: x - independent variable of substitution (GDP)
#intput: model - genetic effects. Options are "A" and "AD"
#Output: estimate, tvalue, stderr and pvalue ( plus the P value of F test on both A and D)
#Straitegy: 1. Separate constant covariates (w) and dynamic coveriates (x)
#Straitegy: 2. Build non-x related only once
#Straitegy: 3. Use apply to iterate x
#Straitegy: 4. Derive dominance indicate d from additive indicate (x) mathmaticaly
#Straitegy: 5. When d is not estimable, continue to test x
#Authors: Xiaolei Liu and Zhiwu Zhang
#Start date: March 1, 2013
#Last update: March 6, 2013
##############################################################################################
print("FarmCPU.LM started")
print(date())
print(paste("No. Obs: ",length(y),sep=""))
print("diminsion of covariates and markers")
if(!is.null(w))print(dim(w))
print("Memory used at begining of LM")
print(memory.size())
gc()
#Constant section (non individual marker specific)
#---------------------------------------------------------
#Configration
nd=20 #number of markes for checking A and D dependency
threshold=.99 # not solving d if correlation between a and d is above this
N=length(y) #Total number of taxa, including missing ones
direction=2
if(orientation=="row")direction=1
print("direction")
print(direction)
#Handler of non numerical y a and w
if(!is.null(w)){
nf=length(w)/N
w=matrix(as.numeric(as.matrix(w)),N,nf )
w=cbind(rep(1,N),w)#add overall mean indicator
q0=ncol(w) #Number of fixed effect excluding gnetic effects
}else{
w=rep(1,N)
nf=0
q0=1
}
y=matrix(as.numeric(as.matrix(y)),N,1 )
print("Adding overall mean")
print(date())
print("Build the static section")
print(date())
#n=nrow(w) #number of taxa without missing
n=N
if(nd>n)nd=n #handler of samples less than nd
k=1 #number of genetic effect: 1 and 2 for A and AD respectively
if(model=="AD")k=2
q1=(q0+1) # vecter index for the posistion of genetic effect (a)
q2=(q0+1):(q0+2) # vecter index for the posistion of genetic effect (a and d)
df=n-q0-k #residual df (this should be varied based on validating d)
iXX=matrix(0,q0+k,q0+k) #Reserve the maximum size of inverse of LHS
#theNA=c(rep(NA,q0),rep(0,k)) # this should not be useful anymore
ww=crossprod(w,w)
wy=crossprod(w,y)
yy=crossprod(y,y)
wwi=solve(ww)
print("Prediction")
print(date())
#Statistics on the reduced model without marker
rhs=wy
beta <- crossprod(wwi,rhs)
ve=(yy-crossprod(beta,rhs))/df
se=sqrt(diag(wwi)*ve)
tvalue=beta/se
pvalue <- 2 * pt(abs(tvalue), df,lower.tail = FALSE)
P0=c(beta[-1],tvalue[-1],se[-1],pvalue[-1])
yp=w%*%beta
print("Detecting genotype coding system")
print(date())
#Finding the middle of genotype coding (1 for 0/1/2 and 0 for -1/0/1)
s=5 # number of taxa sampled
t0=which(x[1:s,]<0)
t1=which(x[1:s,]>1)
middle=0
if(length(t0)<length(t1)) middle=1
print("Memory used after setting LM")
print(memory.size())
gc()
#Dynamic section on individual marker
print("Iterating.................")
print(date())
print("dimension of GD")
print(dim(x))
print(is(x))
#sfInit(parallel=ncpus>1, cpus=ncpus)
#print(sprintf('%s cpus are used', sfCpus()))
#---------------------------------------------------------
#P <- apply(x,direction,function(x){
P <- sfApply(x,direction,function(x){
print("debug sfApply")
r=1 #initial creteria for correlation between a and d
if(model=="AD"){
d=1-abs(x-middle)
r=abs(cor(x[1:nd],d[1:nd]))
if(is.na(r))r=1
if(r<=threshold) x=cbind(x,d) # having both a and d as marker effects
}
print("make some noise here")
#Process the edge (marker effects)
xw=crossprod(w,x)
xy=crossprod(x,y)
xx=crossprod(x,x)
B21 <- crossprod(xw, wwi)
#t1=crossprod(xw,wwi)
t2=B21%*%xw #I have problem of using crossprod and tcrossprod here
B22 <- xx - t2
#B22 can a scaler (A model) or 2 by2 matrix (AD model)
if(model=="AD"&r<=threshold){
invB22 <- FarmCPU.Inv(B22)
}else{
invB22=1/B22
}
NeginvB22B21 <- crossprod(-invB22,B21)
if(model=="AD"&r<=threshold){
B11 <- wwi + crossprod(B21,B21)
}else{
B11 <- wwi + as.numeric(invB22)*crossprod(B21,B21)
}
#Derive inverse of LHS with partationed matrix
iXX[1:q0,1:q0]=B11
if(r>threshold){
iXX[q1,q1]=invB22
iXX[q1,1:q0]=NeginvB22B21
iXX[1:q0,q1]=NeginvB22B21
}else{
iXX[q2,q2]=invB22
iXX[q2,1:q0]=NeginvB22B21
iXX[1:q0,q2]=NeginvB22B21
}
#statistics
rhs=c(wy,xy) #the size varied automaticly by A/AD model and validated d
if(abs(r)>threshold & model=="AD"){
beta <- crossprod(iXX[-(q0+k),-(q0+k)],rhs) #the last one (d) dose not count
df=n-q0-1
}else{
beta <- crossprod(iXX,rhs) #both a and d go in
df=n-q0-2
}
if(model=="A") df=n-q0-1 #change it back for model A
ve=(yy-crossprod(beta,rhs))/df #this is a scaler
#using iXX in the same as above to derive se
if(abs(r)>threshold & model=="AD"){
se=sqrt(diag(iXX[-(q0+k),-(q0+k)])*ve)
}else{
se=sqrt(diag(iXX)*ve)
}
tvalue=beta/se
pvalue <- 2 * pt(abs(tvalue), df,lower.tail = FALSE)
#Handler of dependency between marker are covariate
#if(abs(B22[1,1])<10e-8)pvalue[]=NA
#Calculate P value for A+D effect
if(model=="AD"){
#the last bit could be d or a, the second last may be marker effect not even not
#In either case, calculate F and P value and correct them later
markerbits=(length(beta)-1):length(beta)
SSM=crossprod(beta[markerbits],rhs[markerbits])
F=(SSM/2)/ve
PF=df(F,2,df)
#correcting PF with P from t value
if(r>threshold) PF=pvalue[length(pvalue)]
}
#in case AD model and a/d dependent, add NA column at end
if(r>threshold & model=="AD"){
beta=c(beta,NA)
tvalue=c(tvalue,NA)
se=c(se,NA)
pvalue=c(pvalue,NA)
}
if(model=="AD"){
result=c(beta[-1],tvalue[-1],se[-1],pvalue[-1],PF)
}else{
result=c(beta[-1],tvalue[-1],se[-1],pvalue[-1])
}
}) #end of defyning apply function
#sfStop()
print("iteration accoplished")
print(date())
print("Memory used after iteration")
print(memory.size())
gc()
#Final report
#---------------------------------------------------------
P=t(as.matrix(P))
PF=P[,ncol(P)]
if(model=="AD")P=P[,-ncol(P)]
print("FarmCPU.LM accoplished")
print(date())
print("Memory used at end of LM")
print(memory.size())
gc()
return(list(P=P,P0=P0,PF=PF,Pred=yp))
}
#)#end of cmpfun(
`FarmCPU.LM` <-
#cmpfun(
function(y,w=NULL,GDP,orientation="col",model="A",ncpus=2,myModel=NULL,seqQTN=NULL,npc=0){
#Object: 1. To quickly sovel LM with one variable substitute multiple times
#Object: 2. To fit additive and additive+dominace model
#intput: y - dependent variable
#intput: w - independent variable
#intput: GDP - independent variable of substitution (GDP)
#intput: model - genetic effects. Options are "A" and "AD"
#Output: estimate, tvalue, stderr and pvalue ( plus the P value of F test on both A and D)
#Straitegy: 1. Separate constant covariates (w) and dynamic coveriates (x)
#Straitegy: 2. Build non-x related only once
#Straitegy: 3. Use apply to iterate x
#Straitegy: 4. Derive dominance indicate d from additive indicate (x) mathmaticaly
#Straitegy: 5. When d is not estimable, continue to test x
#Authors: Xiaolei Liu and Zhiwu Zhang
#Start date: March 1, 2013
#Last update: March 6, 2013
##############################################################################################
#print("FarmCPU.LM started")
#print(date())
#print(paste("No. Obs: ",length(y),sep=""))
#print("diminsion of covariates and markers")
if(!is.null(w))#print(dim(w))
#print("Memory used at begining of LM")
#print(memory.size())
gc()
#Constant section (non individual marker specific)
#---------------------------------------------------------
#Configration
nd=20 #number of markes for checking A and D dependency
threshold=.99 # not solving d if correlation between a and d is above this
N=length(y) #Total number of taxa, including missing ones
direction=2
if(orientation=="row")direction=1
#print("direction")
#print(direction)
#Handler of non numerical y a and w
if(!is.null(w)){
nf=length(w)/N
w=matrix(as.numeric(as.matrix(w)),N,nf )
w=cbind(rep(1,N),w)#add overall mean indicator
q0=ncol(w) #Number of fixed effect excluding gnetic effects
}else{
w=rep(1,N)
nf=0
q0=1
}
y=matrix(as.numeric(as.matrix(y)),N,1 )
#print("Adding overall mean")
#print(date())
#print("Build the static section")
#print(date())
#n=nrow(w) #number of taxa without missing
n=N
if(nd>n)nd=n #handler of samples less than nd
k=1 #number of genetic effect: 1 and 2 for A and AD respectively
if(model=="AD")k=2
q1=(q0+1) # vecter index for the posistion of genetic effect (a)
q2=(q0+1):(q0+2) # vecter index for the posistion of genetic effect (a and d)
df=n-q0-k #residual df (this should be varied based on validating d)
iXX=matrix(0,q0+k,q0+k) #Reserve the maximum size of inverse of LHS
#theNA=c(rep(NA,q0),rep(0,k)) # this should not be useful anymore
ww=crossprod(w,w)
wy=crossprod(w,y)
yy=crossprod(y,y)
wwi=solve(ww)
#print("Prediction")
#print(date())
#Statistics on the reduced model without marker
rhs=wy
beta <- crossprod(wwi,rhs)
ve=(yy-crossprod(beta,rhs))/df
se=sqrt(diag(wwi)*ve)
tvalue=beta/se
pvalue <- 2 * pt(abs(tvalue), df,lower.tail = FALSE)
P0=c(beta[-1],tvalue[-1],se[-1],pvalue[-1])
yp=w%*%beta
if(npc!=0){
betapc = beta[2:(npc+1)]
betapred = beta[-c(1:(npc+1))]
}else{
betapc = NULL
betapred = beta[-1]
}
#print("Detecting genotype coding system")
#print(date())
#Finding the middle of genotype coding (1 for 0/1/2 and 0 for -1/0/1)
s=5 # number of taxa sampled
t0=which(GDP[1:s,]<0)
t1=which(GDP[1:s,]>1)
middle=0
if(length(t0)<length(t1)) middle=1
#print("Memory used after setting LM")
#print(memory.size())
gc()
#Dynamic section on individual marker
#print("Iterating.................")
#print(date())
#print("dimension of GD")
#print(dim(x))
#sfInit(parallel=ncpus>1, cpus=ncpus)
##print(sprintf('%s cpus are used', sfCpus()))
#---------------------------------------------------------
#P <- matrix(NA,nrow=nrow(GDP),ncol=4*(nf+1))
if(orientation=="row"){
P <- matrix(NA,nrow=nrow(GDP),ncol=nf+1)
ntest=nrow(GDP)
}else{
P <- matrix(NA,nrow=ncol(GDP),ncol=nf+1)
ntest=ncol(GDP)
}
if(orientation=="row"){
B <- matrix(NA,nrow=nrow(GDP),ncol=1)
}else{
B <- matrix(NA,nrow=ncol(GDP),ncol=1)
}
for(i in 1:ntest){
if(orientation=="row"){
x=GDP[i,]
}else{
x=GDP[,i]
}
#P <- apply(x,direction,function(x){
#P <- sfApply(x,direction,function(x){
r=1 #initial creteria for correlation between a and d
if(model=="AD"){
d=1-abs(x-middle)
r=abs(cor(x[1:nd],d[1:nd]))
if(is.na(r))r=1
if(r<=threshold) x=cbind(x,d) # having both a and d as marker effects
}
#Process the edge (marker effects)
xy=crossprod(x,y)
xx=crossprod(x,x)
if(model=="AD"&r<=threshold){
xw=crossprod(x,w)
wx=crossprod(w,x)
iXX22 <- solve(xx-xw%*%wwi%*%wx)
iXX12 <- (-wwi)%*%wx%*%iXX22
iXX21 <- (-iXX22)%*%xw%*%wwi
iXX11 <- wwi + wwi%*%wx%*%iXX22%*%xw%*%wwi
}else{
xw=crossprod(w,x)
B21 <- crossprod(xw, wwi)
t2=B21%*%xw #I have problem of using crossprod and tcrossprod here
B22 <- xx - t2
invB22=1/B22
NeginvB22B21 <- crossprod(-invB22,B21)
iXX11 <- wwi + as.numeric(invB22)*crossprod(B21,B21)
}
#Derive inverse of LHS with partationed matrix
iXX[1:q0,1:q0]=iXX11
if(r>threshold){
iXX[q1,q1]=invB22
iXX[q1,1:q0]=NeginvB22B21
iXX[1:q0,q1]=NeginvB22B21
}else{
iXX[q2,q2]=iXX22
iXX[q2,1:q0]=iXX21
iXX[1:q0,q2]=iXX12
}
#statistics
rhs=c(wy,xy) #the size varied automaticly by A/AD model and validated d
if(abs(r)>threshold & model=="AD"){
beta <- crossprod(iXX[-(q0+k),-(q0+k)],rhs) #the last one (d) dose not count
df=n-q0-1
}else{
beta <- crossprod(iXX,rhs) #both a and d go in
df=n-q0-2
}
if(model=="A") df=n-q0-1 #change it back for model A
ve=(yy-crossprod(beta,rhs))/df #this is a scaler
#using iXX in the same as above to derive se
if(abs(r)>threshold & model=="AD"){
se=sqrt(diag(iXX[-(q0+k),-(q0+k)])*ve)
}else{
se=sqrt(diag(iXX)*ve)
}
tvalue=beta/se
pvalue <- 2 * pt(abs(tvalue), df,lower.tail = FALSE)
#Handler of dependency between marker are covariate
if(!is.na(abs(B22[1,1]))){
if(abs(B22[1,1])<10e-8)pvalue[]=NA}
#Calculate P value for A+D effect
if(model=="AD"){
#the last bit could be d or a, the second last may be marker effect not even not
#In either case, calculate F and P value and correct them later
markerbits=(length(beta)-1):length(beta)
SSM=crossprod(beta[markerbits],rhs[markerbits])
F=(SSM/2)/ve
PF=df(F,2,df)
#correcting PF with P from t value
if(r>threshold) PF=pvalue[length(pvalue)]
}
#in case AD model and a/d dependent, add NA column at end
if(r>threshold & model=="AD"){
beta=c(beta,NA)
tvalue=c(tvalue,NA)
se=c(se,NA)
pvalue=c(pvalue,NA)
}
if(model=="AD"){
result=c(beta[-1],tvalue[-1],se[-1],pvalue[-1],PF)
}else{
#result=c(beta[-1],tvalue[-1],se[-1],pvalue[-1])
#P[i,]=c(beta[-1],tvalue[-1],se[-1],pvalue[-1])
P[i,c(1:(nf+1))]=pvalue[-1]
B[i,]=beta[length(beta)]
#P[i,c(1:(nf+1))]=beta[-1]
#P[i,c((nf+2):(2*nf+2))]=pvalue[-1]
#P[i,c((nf+2):(2*nf+2))]=tvalue[-1]
#P[i,c((2*nf+3):(3*nf+3))]=se[-1]
#P[i,c((3*nf+4):(4*nf+4))]=pvalue[-1]
}
}
#}
#}) #end of defyning apply function
#sfStop()
#print("iteration accoplished")
#print(date())
#print("Memory used after iteration")
#print(memory.size())
gc()
#Final report
#---------------------------------------------------------
#P=t(as.matrix(P))
#P=as.matrix(P)
PF=P[,ncol(P)]
if(model=="AD")P=P[,-ncol(P)]
#print("FarmCPU.LM accoplished")
#print(date())
#print(dim(P))
#print(P[1:5,])
#print("Memory used at end of LM")
#print(memory.size())
gc()
#print(head(P))
return(list(P=P,P0=P0,PF=PF,Pred=yp,betapc=betapc,betapred=betapred,B=B))
} #end of function(
#)#end of cmpfun(
`FarmCPU.Pred` <- function(pred=NULL,ypred=NULL,name.of.trait=""){
#Object: To display the correlation between observed phenotype and predicted phenotype
#Input 1: pred, the first column is taxa name, the second column is observed phenotype and the third column is predicted phenotype
#Input 2: ypred, the first column is taxa name, the second column is observed phenotype and the third column is predicted phenotype, the different between pred and ypred is that pred is to predict phenotypes with observed values already, ypred is to predict phenotype that is NA
#Output: cor:correlation between observed phenotype and real phenotype (comment: pred is to predict phenotypes with observed values already)
#Output: ycor:correlation between observed phenotype and real phenotype (comment: ypred is to predict phenotype that is NA)
#Output: A table and plot (pdf)
#Requirment: NA
#Authors: Xiaolei Liu
#Start date: June 26, 2014
#Last update: June 26, 2014
##############################################################################################
#print("Create prediction table..." )
cor=NA
ycor=NA
if(!is.null(pred)) {
index=!is.na(pred[,2])
write.table(pred, paste("FarmCPU.", name.of.trait, ".Pred.csv", sep = ""), quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE)
#pred=read.table("FarmCPU.Iteration_02.Farm-CPU.Sim1.Pred.csv",sep=",",header=T)
pdf(paste("FarmCPU.", name.of.trait,".Pred.pdf" ,sep = ""), width = 5,height=5)
par(mar = c(5,6,5,3))
pred.lm = lm(pred[,3][index]~pred[,2][index])
plot(pred[,3][index]~pred[,2][index],pch=20,col='black',ylab="Predicted phenotype",xlab="Observed phenotype",cex.axis=1,cex=1,cex.lab=1,las=1,bty='n',xlim=c(floor(min(pred[,2],na.rm=T)),ceiling(max(pred[,2],na.rm=T))*1.2),ylim=c(floor(min(pred[,3],na.rm=T)),ceiling(max(pred[,3],na.rm=T))*1.2),xaxs="i",yaxs="i")
abline(pred.lm,lty=5,col='red',lwd=2)
#legend(max(pred[,3])+1,max(pred[,2])+1, paste("R^2 = ", 0.5), col = 'black', text.col = "black", lty = 1, ncol=1, cex = 1, lwd=2, bty='o')
cor=round(summary(pred.lm)$r.sq, 3)
text(max(pred[,2],na.rm=T)*1, max(pred[,3],na.rm=T)*1, paste("R^2=", cor), col= "forestgreen", cex = 1, pos=3)
#title(paste("R^2 = ", round(summary(pred.lm)$r.sq, 3)), col= "black", cex = 1)
dev.off()
}
#print("Create prediction table for unknown phenotype...")
if(!is.null(ypred)){
yindex=!is.na(ypred[,2])
ypredrna=ypred[,2][yindex]
write.table(ypred, paste("FarmCPU.", name.of.trait, ".unknownY.Pred.csv", sep = ""), quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE)
if(length(ypredrna)!=0){
pdf(paste("FarmCPU.", name.of.trait,".unknownY.Pred.pdf" ,sep = ""), width = 5,height=5)
par(mar = c(5,6,5,3))
ypred.lm = lm(ypred[,3][yindex]~ypredrna)
plot(ypred[,3][yindex]~ypredrna,pch=20,col='black',ylab="Predicted phenotype",xlab="Observed phenotype",cex.axis=1,cex=1,cex.lab=1,las=1,bty='n',xlim=c(floor(min(pred[,2],na.rm=T)),ceiling(max(ypred[,2],na.rm=T))*1.2),ylim=c(floor(min(pred[,3],na.rm=T)),ceiling(max(ypred[,3],na.rm=T))*1.2),xaxs="i",yaxs="i")
abline(ypred.lm,lty=5,col='red',lwd=2)
ycor=round(summary(ypred.lm)$r.sq, 3)
text(max(ypred[,2],na.rm=T)*1,max(ypred[,3],na.rm=T)*1, paste("R^2=", ycor), col= "forestgreen", cex = 1, pos=3)
dev.off()
}else{
print("There is no observed phenotype for predicted phenotype")
}
}
return(list(cor=cor,ycor=ycor))
}#end of `FarmCPU.Pred`
`FarmCPU.Prior` <-
function(GM,P=NULL,Prior=NULL,kinship.algorithm="FARM-CPU"){
#Object: Set prior on existing p value
#Input: GM - m by 3 matrix for SNP name, chromosome and BP
#Input: Prior - s by 4 matrix for SNP name, chromosome, BP and Pvalue
#Input: P - m by 1 matrix containing probability
#Requirement: P and GM are in the same order, Prior is part of GM except P value
#Output: P - m by 1 matrix containing probability
#Authors: Zhiwu Zhang
# Last update: March 10, 2013
##############################################################################
#print("FarmCPU.Prior Started")
#print("dimension of GM")
#print(dim(GM))
if(is.null(Prior)& kinship.algorithm!="FARM-CPU")return(P)
if(is.null(Prior)& is.null(P))return(P)
#get prior position
if(!is.null(Prior)) index=match(Prior[,1],GM[,1],nomatch = 0)
#if(is.null(P)) P=runif(nrow(GM)) #set random p value if not provided (This is not helpful)
#print("debug set prior a")
#Get product with prior if provided
if(!is.null(Prior) & !is.null(P) )P[index]=P[index]*Prior[,4]
#print("debug set prior b")
return(P)
}#The function FarmCPU.Prior ends here
`FarmCPU` <-
function(Y=NULL,GD=NULL,GM=NULL,CV=NULL,GP=NULL,Yt=NULL,DPP=1000000,kinship.algorithm="FARM-CPU",file.output=TRUE,cutOff=0.01,method.GLM="FarmCPU.LM",method.sub="reward",method.sub.final="reward",method.bin="static",bin.size=c(5e5,5e6,5e7),bin.selection=seq(10,100,10),
memo=NULL,Prior=NULL,ncpus=1,maxLoop=10,threshold.output=.01,
WS=c(1e0,1e3,1e4,1e5,1e6,1e7),alpha=c(.01,.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1),maxOut=100,QTN.position=NULL,
converge=1,iteration.output=FALSE,acceleration=0,model="A",MAF.calculate=FALSE,plot.style="FarmCPU",p.threshold=NA,QTN.threshold=0.01,maf.threshold=0.03,ycor=NULL,bound=NULL){
#Object: GWAS and GS by using FarmCPU method
#Input: Y,GD,GM,CV
#Input: GD - n by m +1 dataframe or n by m big.matrix
#Input: GD - n by m matrix. This is Genotype Data Pure (GD). THERE IS NOT COLUMN FOR TAXA.
#Requirement: Y, GD and CV have same taxa order. GD and GM have the same order on SNP
#Requirement: Y can have missing data. CV, GD and GM can not. Non-variable markers are allowed
#Output: GWAS,GPS,Pred
#Authors: Xiaolei Liu and Zhiwu Zhang
# Date start: Febuary 24, 2013
# Last update: April 2, 2013
##############################################################################################
#print("FarmCPU Started")
#print(date())
#print("Memory used at begining of BUS")
#print(memory.size())
#print(dim(GD))
#print(dim(GM))
print("--------------------- Welcome to FarmCPU ----------------------------")
echo=TRUE
FarmCPU.Version=FarmCPU.0000()
print("FarmCPU Started...")
if(ncol(Y)>2) stop("FarmCPU only accept single phenotype, please specify a column, like myY[,c(1,3)]")
#Set orientation
#Strategy: the number of rows in GD and GM are the same if GD has SNP as row
nm=nrow(GM)
ny=nrow(Y)
ngd1=nrow(GD)
ngd2=ncol(GD)
if(!is.null(CV)){
CV=as.matrix(CV)
npc=ncol(CV)
}else{
npc=0
}
ngd1=abs(ngd1-nm)
ngd2=abs(ngd2-nm)
orientation="col"
theSNP=2
ns=nrow(GD)
if(min(ngd1,ngd2)==0){
orientation="row"
theSNP=1
ns=ncol(GD)
}
#acceleration
ac=NULL
if(acceleration!=0) ac=rep(1.0,nm)
#Handler of non numeric chr
#GM[,2]=as.numeric(GM[,2])
#Handler 0 bp
index=which(GM[,3]==0 )
if(length(index)>0){
#print("Warning: there is 0 bp which was set to 1")
#print(length(index))
GM[index,3]=1 #This is problematic
}
#handler of multiple CPU on big.matrix
if(ncpus>1 & is.big.matrix(GD)){
#print("Multiple CPUs are not avaiable for big.matrix. ")
#print("The big.matrix will be converted to regular matrix which takes more memmory")
#stop("Import the genotype as regula R matrix or set single CPU")
}
#print("number of CPU required")
#print(ncpus)
if(ncpus>1) sfInit(parallel=ncpus>1, cpus=ncpus)
P=GP
if(!is.null(GP))P=GP[,4] #get the p value
#print("maxLoop")
#print(maxLoop)
gc()
#print(memory.size())
#print(date())
#print(is(GD))
#print(dim(GD))
#handler of GD with taxa column
if(ncol(GD)>nm & orientation=="col"){
#print("GD has taxa column")
if(is.big.matrix(GD)){
#retain as bi.matrix
GD=deepcopy(GD,rows=1:nrow(GD),cols=2:ncol(GD)) #This cause problem with multi cpu
}else{
GD=as.matrix(GD[,-1])
}
}#end of if(ncol...
#Change to regula matrix for multiple CPUs
if(ncpus>1) GD=as.matrix(GD)
#print("after remove taxa in GD")
gc()
#print(memory.size())
#print(date())
#print(is(GD))
#print(dim(GD))
if(model=="A"){
shift=0
}else if(model=="AD"){
shift=1
}else {
print("Please choose 'A' model or 'AD' model")
}
#print("bin.selection")
#print(bin.selection)
#calculating MAF
if(MAF.calculate==FALSE){
MAF=NA
}else{
MAF=apply(GD,theSNP,mean)
MAF=matrix(MAF,nrow=1)
MAF=apply(MAF,2,function(x) min(1-x/2,x/2))
}
for (trait in 2: ncol(Y)) {
name.of.trait=colnames(Y)[trait]
#print(paste("Processing trait: ",name.of.trait,sep=""))
if(!is.null(memo)) name.of.trait=paste(memo,".",name.of.trait,sep="")
#===============================================================================
#handler of missing phenotype (keep raw Y,CV and GD)
#print(date())
#print("Memory used before processing missing")
#print(memory.size())
#index for missing phenotype
index=1:nm
seqTaxa=which(!is.na(Y[,trait]))
if(MAF.calculate==TRUE){
if(is.na(maf.threshold)){
if(length(seqTaxa)<=100) maf.threshold=0.05
#if(length(seqTaxa)>100&&length(seqTaxa)<=500) maf.threshold=0.01
#if(length(seqTaxa)>300&&length(seqTaxa)<=500) maf.threshold=0.05
#if(length(seqTaxa)>500&&length(seqTaxa)<=1000) maf.threshold=0.01
if(length(seqTaxa)>100) maf.threshold=0
}else{
maf.threshold=maf.threshold
}
mafindex=(1:nm)[MAF>=maf.threshold]
MAF=MAF[mafindex]
index=mafindex
GM=GM[index,]
nm=length(index)
}
#predict = !(length(seqTaxa)==nrow(Y))#judge whether there is NA in phenotype
predict = !is.null(Yt)#judge whether there is two phenotypes
PredictYt = NULL
ypred = NULL
#print(length(seqTaxa))
#print(nrow(Y))
#print("predict")
#print(predict)
Y1=Y[seqTaxa,]
#if(is.numeric(CV)){CV1=CV[seqTaxa]
#}else{
# CV1=CV[seqTaxa,]}
CV1=CV[seqTaxa,]
#print(head(CV1))
if(length(seqTaxa)<1) stop("FarmCPU stoped as no data in Y")
#print("Extract genotype for phenotyped taxa")
#print(memory.size())
#print(is(GD))
#print(dim(GD))
#print(length(seqTaxa))
#print(length(index))
#GD based on big.matrix and orientation
if(orientation=="col"){
if(is.big.matrix(GD)){
GD1=deepcopy(GD,rows=seqTaxa,cols=index)
}else{
GD1=GD[seqTaxa,index]
}
}else{
if(is.big.matrix(GD)){
GD1=deepcopy(GD,rows=index,cols=seqTaxa)
}else{
GD1=GD[index,seqTaxa]
}
}# end of if orientation
#prepare the data for predict NA in phenotype
if(predict){
seqTaxa2=which(is.na(Y[,trait]))
#seqTaxa2=which(is.na(Yt[,trait]))
#Y2=Yt[seqTaxa2,]
PredictYt=Yt[seqTaxa2,]
if(is.numeric(CV)){CV2=CV[seqTaxa2]
}else{
CV2=CV[seqTaxa2,]}
#GD based on big.matrix and orientation
if(orientation=="col"){
if(is.big.matrix(GD)){
GD2=deepcopy(GD,rows=seqTaxa2,cols=index)
}else{
GD2=GD[seqTaxa2,index]
}
}else{
if(is.big.matrix(GD)){
GD2=deepcopy(GD,rows=index,cols=seqTaxa2)
}else{
GD2=GD[index,seqTaxa2]
}
}# end of if orientation
}
#print("dim(GD2)")
#print(dim(GD2))
#Step 1: preliminary screening
#print(date())
#print("Memory used before 1st GLM")
#print(memory.size())
theLoop=0
theConverge=0
seqQTN.save=c(0)
seqQTN.pre=c(-1)
isDone=FALSE
name.of.trait2=name.of.trait
#while(theLoop<maxLoop & !converge ) {
while(!isDone) {
theLoop=theLoop+1
print(paste("Current loop: ",theLoop," out of maximum of ", maxLoop, sep=""))
#print(date())
spacer="0"
if(theLoop>9)spacer=""
if(iteration.output) name.of.trait2=paste("Iteration_",spacer,theLoop,".",name.of.trait,sep="")
if(method.bin=="NONE")maxLoop=1 #force to exit for GLM model
#Step 2a: Set prior
#print("Memory used before Prior")
#print(memory.size())
myPrior=FarmCPU.Prior(GM=GM,P=P,Prior=Prior,kinship.algorithm=kinship.algorithm)
#Step 2b: Set bins
#print(myPrior[1:5])
#print("Memory used before Bin")
#print(memory.size())
#print(date())
if(theLoop<=2){
myBin=FarmCPU.BIN(Y=Y1[,c(1,trait)],GD=GD1,GM=GM,CV=CV1,orientation=orientation,P=myPrior,method=method.bin,b=bin.size,s=bin.selection,theLoop=theLoop,bound=bound)
}else{
myBin=FarmCPU.BIN(Y=Y1[,c(1,trait)],GD=GD1,GM=GM,CV=theCV,orientation=orientation,P=myPrior,method=method.bin,b=bin.size,s=bin.selection,theLoop=theLoop)
}
#Step 2c: Remove bin dependency
#print(date())
#print("Memory used before Remove")
#print(memory.size())
#Remove QTNs in LD
seqQTN=myBin$seqQTN
ve.save=myBin$ve.save
vg.save=myBin$vg.save
#print(seqQTN)
#if(theLoop==2&&is.null(seqQTN)){maxLoop=2}#force to exit for GLM model while seqQTN=NULL and h2=0
if(theLoop==2){
#print(head(P))
#print(min(P,na.rm=TRUE))
if(!is.na(p.threshold)){
if(min(myPrior,na.rm=TRUE)>p.threshold){
seqQTN=NULL
print("Top snps have little effect, set seqQTN to NULL!")
#print("**********FarmCPU ACCOMPLISHED**********")
}
}else{
if(min(myPrior,na.rm=TRUE)>0.01/nm){
seqQTN=NULL
print("Top snps have little effect, set seqQTN to NULL!")
#print("**********FarmCPU ACCOMPLISHED**********")
}
}
}
#when FARM-CPU can not work, make a new QQ plot and manhatthan plot
if(theLoop==2&&is.null(seqQTN)){
#Report
GWAS=cbind(GM,P,MAF,myGLM$B)
#if(isDone | iteration.output){
gc()
pred=myGLM$Pred
#print(pred)
if(!is.null(pred)) pred=cbind(Y1,myGLM$Pred) #Need to be consistant to CMLM
#print(pred)
p.GLM=GWAS[,4]
p.GLM.log=-log10(quantile(p.GLM,na.rm=TRUE,0.05))
#set.seed(666)
#bonf.log=-log10(quantile(runif(nm),0.05))
bonf.log=1.3
bonf.compare=p.GLM.log/bonf.log
p.FARMCPU.log=-log10(p.GLM)/bonf.compare
GWAS[,4]=10^(-p.FARMCPU.log)
GWAS[,4][which(GWAS[,4]>1)]=1
#colnames(GWAS)=c(colnames(GM),"P.value","maf","nobs","Rsquare.of.Model.without.SNP","Rsquare.of.Model.with.SNP","FDR_Adjusted_P-values")
colnames(GWAS)=c(colnames(GM),"P.value","maf","effect")
Vp=var(Y1[,2],na.rm=TRUE)
#print("Calling Report..")
if(file.output){
if(npc!=0){
betapc=cbind(c(1:npc),myGLM$betapc)
colnames(betapc)=c("CV","Effect")
write.csv(betapc,paste("FarmCPU.",name.of.trait2,".CVeffect.csv",sep=""),quote=F,row.names=FALSE)
}
GAPIT.Report(name.of.trait=name.of.trait2,GWAS=GWAS,pred=NULL,ypred=ypred,tvalue=NULL,stderr=stderr,Vp=Vp,DPP=DPP,cutOff=cutOff,threshold.output=threshold.output,MAF=MAF,seqQTN=QTN.position,MAF.calculate=MAF.calculate,plot.style=plot.style)
myPower=GAPIT.Power(WS=WS, alpha=alpha, maxOut=maxOut,seqQTN=QTN.position,GM=GM,GWAS=GWAS,MaxBP=1e10)
}
#} #enf of is done
break
}#force to exit for GLM model while seqQTN=NULL and h2=0
#print("debug seqQTN")
#print(seqQTN)
#print(seqQTN.save)
if(!is.null(seqQTN.save)&&theLoop>1){
if(seqQTN.save!=0 & seqQTN.save!=-1 & !is.null(seqQTN) ) seqQTN=union(seqQTN,seqQTN.save) #Force previous QTNs in the model
#print("**********POSSIBLE QTNs combined**********")
}
#if(!is.null(seqQTN.save)){
#if(theLoop>=4 && !is.null(seqQTN.save) && (length(intersect(seqQTN.pre,seqQTN))/length(union(seqQTN.pre,seqQTN)))==1){
#if(seqQTN.save!=0 & seqQTN.save!=-1 & !is.null(seqQTN) )
#{seqQTN=union(seqQTN,seqQTN.save) #Force previous QTNs in the model
#}
if(theLoop!=1){
seqQTN.p=myPrior[seqQTN]
if(theLoop==2){
#index.p=seqQTN.p<0.01/nm
index.p=seqQTN.p<QTN.threshold
if(!is.na(p.threshold)){
#index.p=seqQTN.p<p.threshold
index.p=seqQTN.p<QTN.threshold
}
seqQTN.p=seqQTN.p[index.p]
seqQTN=seqQTN[index.p]
seqQTN.p=seqQTN.p[!is.na(seqQTN)]
seqQTN=seqQTN[!is.na(seqQTN)]
}else{
#print("seqQTN.save")
#print(seqQTN.save)
#print("seqQTN")
#print(seqQTN)
#print(length(seqQTN.save))
#print(seqQTN.p[1:length(seqQTN.save)]<1)
#print(seqQTN.p[(length(seqQTN.save)+1):length(seqQTN)]<0.01/nm)
#index.p=seqQTN.p<(0.01/nm)
index.p=seqQTN.p<QTN.threshold
if(!is.na(p.threshold)){
#index.p=seqQTN.p<p.threshold
index.p=seqQTN.p<QTN.threshold
}
index.p[seqQTN%in%seqQTN.save]=TRUE
#print(index.p)
seqQTN.p=seqQTN.p[index.p]
seqQTN=seqQTN[index.p]
seqQTN.p=seqQTN.p[!is.na(seqQTN)]
seqQTN=seqQTN[!is.na(seqQTN)]
}
}
myRemove=FarmCPU.Remove(GD=GD1,GM=GM,seqQTN=seqQTN,seqQTN.p=seqQTN.p,orientation=orientation,threshold=.7)
#Recoding QTNs history
seqQTN=myRemove$seqQTN
#if(length(setdiff(seqQTN,seqQTN.save))==0 & length(intersect(seqQTN,seqQTN.save))>0 ) converge=TRUE
theConverge=length(intersect(seqQTN,seqQTN.save))/length(union(seqQTN,seqQTN.save))
circle=(length(union(seqQTN,seqQTN.pre))==length(intersect(seqQTN,seqQTN.pre)) )
#handler of initial status
if(is.null(seqQTN.pre)){circle=FALSE
}else{
if(seqQTN.pre[1]==0) circle=FALSE
if(seqQTN.pre[1]==-1) circle=FALSE
}
#print("circle objective")
print("seqQTN")
print(seqQTN)
print("scanning...")
if(theLoop==maxLoop){
print(paste("Total number of possible QTNs in the model is: ", length(seqQTN),sep=""))
}
#print(seqQTN.save)
#print(seqQTN.pre)
#print(circle)
#print(converge)
#print("converge current")
#print(theConverge)
isDone=((theLoop>=maxLoop) | (theConverge>=converge) |circle )
seqQTN.pre=seqQTN.save
seqQTN.save=seqQTN
#myRemove=FarmCPU.Remove(GD=GD1,GM=GM,seqQTN=seqQTN,orientation=orientation,threshold=.7)
#Step 3: Screen with bins
rm(myBin)
gc()
#print(date())
#print("Memory used before 2nd GLM")
#print(memory.size())
theCV=CV1
if(!is.null(myRemove$bin)){
if(theLoop==1){
theCV=cbind(CV1,myRemove$bin)
}else{
#print("remove PCs since 2nd iteration")
theCV=cbind(CV1,myRemove$bin)
#theCV=myRemove$bin
}
}
myGLM=FarmCPU.GLM(Y=Y1[,c(1,trait)],GDP=GD1,GM=GM,CV=theCV,orientation=orientation,package=method.GLM,ncpus=ncpus,model=model,seqQTN=seqQTN,npc=npc)
#Step 4: Background unit substitution
#print(date())
#print("Memory used before SUB")
#print(memory.size())
#print("After calling SUB")
#How about having reward during the process and mean at end?
if(!isDone){
myGLM=FarmCPU.SUB(GM=GM,GLM=myGLM,QTN=GM[myRemove$seqQTN,],method=method.sub,model=model)
}else{
myGLM=FarmCPU.SUB(GM=GM,GLM=myGLM,QTN=GM[myRemove$seqQTN,],method=method.sub.final,model=model)
}
#print(date())
P=myGLM$P[,ncol(myGLM$P)-shift]
#acceleration
if(!is.null(ac)){
ac=FarmCPU.Accelerate(ac=ac,QTN=myRemove$seqQTN,acceleration=acceleration)
P=P/ac
}
#print("Acceleration in bus")
index=which(ac>1)
#print(cbind(index,ac[index],P[index]))
#if P value is 0
#if(min(P,na.rm=TRUE)==0) break
P[P==0] <- min(P[P!=0],na.rm=TRUE)*0.01
#Report
if(isDone | iteration.output){
#print("Report assemmbling...")
#-------------------------------------------------------------------------------
#Assemble result for report
gc()
pred=myGLM$Pred
PredictY=NULL
if(!is.null(theCV)&&predict){
#Statistics on the reduced model without marker
beta <- myGLM$betapred
#w=seqQTN
if(orientation=="row"){
predw=rbind(1,t(CV1),GD2[seqQTN,])
}else{
predw=cbind(1,CV1,GD2[,seqQTN])
}
#ypred=predw%*%beta
#if(!is.null(theCV)){
#nf=length(theCV)/length(seqTaxa2)
#theCV=matrix(as.numeric(as.matrix(theCV)),length(seqTaxa2),nf)
#predw=cbind(rep(1,length(seqTaxa2)),theCV)#add overall mean indicator
#}else{
#predw=rep(1,length(seqTaxa2))
#}
#print(dim(predw))
#print(predw)
#print(length(beta))
#print(beta)
PredictY=predw%*%beta
#print(PredictY)
#PredictYt[seqTaxa2,]=PredictY
}
if(!is.null(pred)) pred=cbind(Y1,myGLM$Pred) #Need to be consistant to CMLM
if(!is.null(PredictY)) ypred=cbind(PredictYt,PredictY) #Need to be consistant to CMLM
#P=myGLM$P[,ncol(myGLM$P)-shift]
#myGLM$P is in order of estimate, tvalue, stderr and pvalue
#nf=ncol(myGLM$P)/4
#tvalue=myGLM$P[,nf*2-shift]
#stderr=myGLM$P[,3*nf-shift]
#print("MAF might cause problem")
#print(length(MAF))
#GWAS=cbind(GM,P,MAF,NA,NA,NA,NA)
GWAS=cbind(GM,P,MAF,myGLM$B)
#colnames(GWAS)=c(colnames(GM),"P.value","maf","nobs","Rsquare.of.Model.without.SNP","Rsquare.of.Model.with.SNP","FDR_Adjusted_P-values")
colnames(GWAS)=c(colnames(GM),"P.value","maf","effect")
Vp=var(Y1[,2],na.rm=TRUE)
if(!is.null(ypred)){
yindex=!is.na(ypred[,2])
ypredrna=ypred[,2][yindex]
ypred.lm = lm(ypred[,3][yindex]~ypredrna)
ycor=round(summary(ypred.lm)$r.sq, 3)
#print(ycor)
}
#print("Calling Report..")
if(file.output){
if(theLoop==1&&is.null(CV)){
if(npc!=0){
betapc=cbind(c(1:npc),myGLM$betapc)
colnames(betapc)=c("CV","Effect")
write.csv(betapc,paste("FarmCPU.",name.of.trait2,".CVeffect.csv",sep=""),quote=F,row.names=FALSE)
}
GAPIT.Report(name.of.trait=name.of.trait2,GWAS=GWAS,pred=NULL,ypred=NULL,tvalue=NULL,stderr=stderr,Vp=Vp,DPP=DPP,cutOff=cutOff,threshold.output=threshold.output,MAF=MAF,seqQTN=QTN.position,MAF.calculate=MAF.calculate,plot.style=plot.style)
}else{
if(npc!=0){
betapc=cbind(c(1:npc),myGLM$betapc)
colnames(betapc)=c("CV","Effect")
write.csv(betapc,paste("FarmCPU.",name.of.trait2,".CVeffect.csv",sep=""),quote=F,row.names=FALSE)
}
GAPIT.Report(name.of.trait=name.of.trait2,GWAS=GWAS,pred=NULL,ypred=ypred,tvalue=NULL,stderr=stderr,Vp=Vp,DPP=DPP,cutOff=cutOff,threshold.output=threshold.output,MAF=MAF,seqQTN=QTN.position,MAF.calculate=MAF.calculate,plot.style=plot.style)
}
}
#Evaluate Power vs FDR and type I error
#print("Calling Power..")
myPower=GAPIT.Power(WS=WS, alpha=alpha, maxOut=maxOut,seqQTN=QTN.position,GM=GM,GWAS=GWAS,MaxBP=1e10)
} #enf of is done
#if(length(seqQTN)==1) maxLoop=3
} #end of while loop
print("**********FarmCPU ACCOMPLISHED SUCCESSFULLY**********")
#print(name.of.trait)
#print("-----------------------------------------------------------------------")
#===============================================================================
}# end of loop on trait
if(ncpus>1)sfStop()
gc()
if(ncol(Y)==2) {
return (list(GWAS=GWAS,GPS=NULL,Pred=pred,compression=NULL,kinship.optimum=NULL,kinship=NULL,ycor=ycor,FDR=myPower$FDR,Power=myPower$Power,Power.Alpha=myPower$Power.Alpha,alpha=myPower$alpha,betapc=myGLM$betapc))
}else{
return (list(GWAS=NULL,GPS=NULL,Pred=NULL,compression=NULL,kinship.optimum=NULL,kinship=NULL))
}
}#The FarmCPU function ends here
`FarmCPU.Remove` <-
function(GDP=NULL,GM=NULL,seqQTN=NULL,seqQTN.p=NULL,orientation="col",threshold=.99){
#Objective: Remove bins that are highly correlated
#Input: GDP - n by m+1 matrix. The first colum is taxa name. The rest are m genotype
#Input: GM - m by 3 matrix for SNP name, chromosome and BP
#Input: seqQTN - s by 1 vecter for index of QTN on GM (+1 for GDP column wise)
#Requirement: GDP and GM have the same order on SNP
#Output: bin - n by s0 matrix of genotype
#Output: binmap - s0 by 3 matrix for map of bin
#Output: seqQTN - s0 by 1 vecter for index of QTN on GM (+1 for GDP column wise)
#Relationship: bin=GDP[,c(seqQTN)], binmap=GM[seqQTN,], s0<=s
#Authors: Zhiwu Zhang
# Last update: March 4, 2013
##############################################################################
#print("FarmCPU.Remove Started")
#print(date())
if(is.null(seqQTN))return(list(bin=NULL,binmap=NULL,seqQTN=NULL))
#remove seqQTN with unsignificant p values
#index.p=seqQTN.p<0.01
#seqQTN.p=seqQTN.p[index.p]
#seqQTN=seqQTN[index.p]
#sort seqQTN using p values
seqQTN=seqQTN[order(seqQTN.p)]
hugeNum=10e10
n=length(seqQTN)
#print("Number of bins and GDP")
#print(n)
#print(dim(GDP))
#print(seqQTN)
#fielter bins by physical location
binmap=GM[seqQTN,]
#print("binmap")
#print(binmap)
cb=as.numeric(binmap[,2])*hugeNum+as.numeric(binmap[,3])#create ID for chromosome and bp
cb.unique=unique(cb)
#print("debuge")
#print(cb)
#print(cb.unique)
index=match(cb.unique,cb,nomatch = 0)
seqQTN=seqQTN[index]
#print("Number of bins after chr and bp fillter")
n=length(seqQTN) #update n
#print(n)
#print(date())
#Set sample
ratio=.1
maxNum=100000
if(orientation=="col"){
s=nrow(GDP) #sample size
m=ncol(GDP) #number of markers
}else{
m=nrow(GDP) #sample size
s=ncol(GDP) #number of markers
}
#print("Determine number of samples")
#print(date())
#sampled=floor(ratio*s)
sampled=s
if(sampled>maxNum)sampled=maxNum
#print("Number of individuals sampled to test dependency of bins")
#print(sampled)
#index=sample(s,sampled)
index=1:sampled
#print("Get the samples")
#print(date())
#This section has problem of turning big.matrix to R matrix
#It is OK as x is small
if(orientation=="col"){
if(is.big.matrix(GDP)){
x=as.matrix(deepcopy(GDP,rows=index,cols=seqQTN) )
}else{
x=GDP[index,seqQTN]
}
}else{
if(is.big.matrix(GDP)){
x=t(as.matrix(deepcopy(GDP,rows=seqQTN,cols=index) ))
}else{
x=t(GDP[seqQTN,index] )
}
}# end of if orientation
#print("Calculating r")
#print(date())
#print("matrix x")
#print(is(x))
#print(dim(x))
#print(length(x))
#x=x[,order(seqQTN.p)]
#print("x")
#print(head(x))
r=cor(as.matrix(x))
#print("r")
#print(r)
#print("indexing r")
#print(date())
index=abs(r)>threshold
#print("index")
#print(index)
#print("Fancy algorithm")
#print(date())
#print("dimension of r")
#print(dim(r))
b=r*0
b[index]=1
c=1-b
#print("for loop")
#print(date())
#for(i in 1:(n-1)){
# for (j in (i+1):n){
# b[j,j]=b[j,j]*c[i,j]
# }
#}
#The above are replaced by following
c[lower.tri(c)]=1
diag(c)=1
bd <- apply(c,2,prod)
#print("Positioning...")
#print(date())
#position=diag(b)==1
position=(bd==1)
seqQTN=seqQTN[position]
#============================end of optimum============================================
seqQTN=seqQTN[!is.na(seqQTN)]
#print("Extract bin genotype data")
#print(date())
#This section has problem of turning big.matrix to R matrix
if(orientation=="col"){
if(is.big.matrix(GDP)){
bin=as.matrix(deepcopy(GDP,cols=seqQTN) )
}else{
bin=GDP[,seqQTN]
}
}else{
if(is.big.matrix(GDP)){
bin=t(as.matrix(deepcopy(GDP,rows=seqQTN,) ))
}else{
bin=t(GDP[seqQTN,] )
}
}# end of if orientation
#print("Get bin map")
#print(date())
binmap=GM[seqQTN,]
#print("Number of bins left:")
#print(length(seqQTN))
#print("FarmCPU.Remove accomplished successfully!")
return(list(bin=bin,binmap=binmap,seqQTN=seqQTN))
}#The function FarmCPU.Remove ends here
`FarmCPU.SUB` <-
function(GM=NULL,GLM=NULL,QTN=NULL,method="mean",useapply=TRUE,model="A"){
#Input: FarmCPU.GLM object
#Input: QTN - s by 3 matrix for SNP name, chromosome and BP
#Input: method - options are "penalty", "reward","mean","median",and "onsite"
#Requirement: P has row name of SNP. s<=t. covariates of QTNs are next to SNP
#Output: GLM with the last column of P updated by the substituded p values
#Authors: Xiaolei Liu and Zhiwu Zhang
# Last update: Febuary 26, 2013
##############################################################################
if(is.null(GLM$P)) return(NULL) #P is required
if(is.null(QTN)) return(NULL) #QTN is required
#print("FarmCPU.SUB Started")
#print("dimension of QTN")
#print(dim(QTN))
#print(length(QTN))
#print("debug")
#print(QTN)
#print(GLM)
#position=match(QTN[,1], rownames(GLM$P), nomatch = 0)
position=match(QTN[,1], GM[,1], nomatch = 0)
#position=(1:nrow(GM))[GM[,1]%in%QTN[,1]]
nqtn=length(position)
#print("Position of QTN on GM")
#print(length(position))
#print(position)
#get position of QTNs (last nqtn columns from the second last)
if(model=="A"){
index=(ncol(GLM$P)-nqtn):(ncol(GLM$P)-1)
spot=ncol(GLM$P)
}else{
index=(ncol(GLM$P)-nqtn-1):(ncol(GLM$P)-2)
spot=ncol(GLM$P)-1
}
#print("Position of P value of QTN")
#print(index)
#print("Position of P value of marker")
#print(spot)
#print('ok')
#print(ncol(GLM$P))
#print(nqtn)
#print((ncol(GLM$P)-nqtn))
#print((ncol(GLM$P)-1))
#print(min(GLM$P[,index],na.rm=TRUE))
#print(GLM$P[position,spot])
if(ncol(GLM$P)!=1){
if(length(index)>1){
if(method=="penalty") P.QTN=apply(GLM$P[,index],2,max,na.rm=TRUE)
if(method=="reward") P.QTN=apply(GLM$P[,index],2,min,na.rm=TRUE)
if(method=="mean") P.QTN=apply(GLM$P[,index],2,mean,na.rm=TRUE)
if(method=="median") P.QTN=apply(GLM$P[,index],2,median,na.rm=TRUE)
if(method=="onsite") P.QTN=GLM$P0[(length(GLM$P0)-nqtn+1):length(GLM$P0)]
}else{
if(method=="penalty") P.QTN=max(GLM$P[,index],na.rm=TRUE)
if(method=="reward") P.QTN=min(GLM$P[,index],na.rm=TRUE)
if(method=="mean") P.QTN=mean(GLM$P[,index],na.rm=TRUE)
if(method=="median") P.QTN=median(GLM$P[,index],median,na.rm=TRUE)
if(method=="onsite") P.QTN=GLM$P0[(length(GLM$P0)-nqtn+1):length(GLM$P0)]
}
#replace SNP pvalues with QTN pvalue
#print("Substituting...")
GLM$P[position,spot]=P.QTN
#print(position)
#print(GLM$betapred)
GLM$B[position,]=GLM$betapred
}
#write.table(P,file="debuger.csv",sep=",")
return(GLM)
}#The function FarmCPU.SUB ends here
`FarmCPU.P.Threshold` <-
function(GD=NULL,GM=NULL,Y=NULL,trait="",theRep=100){
#Input: GD - Genotype
#Input: GM - SNP name, chromosome and BP
#Input: Y - phenotype, 2 columns
#Input: trait - name of the trait
#Input: theRep - number of replicates
#Output: get minimum p value of each permutation and the recommend p.threshold used for FarmCPU model
#Authors: Xiaolei Liu
# Last update: April 6, 2015
##############################################################################
#theRep=theRep
#trait=trait
if(is.null(GD))return(NULL)
if(is.null(GM))return(NULL)
if(is.null(Y))return(NULL)
set.seed(12345)
i=1
for(i in 1:theRep){
index=1:nrow(Y)
index.shuffle=sample(index,length(index),replace=F)
Y.shuffle=Y
Y.shuffle[,2]=Y.shuffle[index.shuffle,2]
#GWAS with FarmCPU...
myFarmCPU=FarmCPU(
Y=Y.shuffle[,c(1,2)],#Phenotype
GD=GD,#Genotype
GM=GM,#Map information
file.output=FALSE,
method.bin="optimum", #options are "static" and "optimum", default is static and this gives the fastest speed. If you want to use random model to optimize possible QTNs selection, use method.bin="optimum"
maxLoop=1,#maxLoop is used to set the maximum iterations you want
iteration.output=TRUE,#iteration.output=TRUE means to output results of every iteration
)
pvalue=min(myFarmCPU$GWAS[,4],na.rm=T)
if(i==1){
pvalue.final=pvalue
}else{
pvalue.final=c(pvalue.final,pvalue)
}
}#end of theRep
write.table(pvalue.final,paste("FarmCPU.p.threshold.optimize.",trait,".txt",sep=""),sep="\t",col.names=F,quote=F,row.names=F)
print("The p.threshold of this data set should be:")
print(sort(pvalue.final)[ceiling(theRep*0.05)])
}#end of `FarmCPU.P.Threshold`
`FarmCPU.Burger` <-
function(Y=NULL,CV=NULL,GK=NULL){
#Object: To calculate likelihood, variances and ratio, revised by Xiaolei based on GAPIT.Burger function from GAPIT package
#Straitegy: NA
#Output: P value
#intput:
#Y: phenotype with columns of taxa,Y1,Y2...
#CV: covariate variables with columns of taxa,v1,v2...
#GK: Genotype data in numerical format, taxa goes to row and snp go to columns. the first column is taxa (same as GAPIT.bread)
#Authors: Xiaolei Liu ,Jiabo Wang and Zhiwu Zhang
#Last update: Dec 21, 2016
##############################################################################################
#print("FarmCPU.Burger in progress...")
if(!is.null(CV)){
CV=as.matrix(CV)#change CV to a matrix when it is a vector xiaolei changed here
theCV=as.matrix(cbind(matrix(1,nrow(CV),1),CV)) ###########for FarmCPU
}else{
theCV=matrix(1,nrow(Y),1)
}
#handler of single column GK
n=nrow(GK)
m=ncol(GK)
if(m>2){
theGK=as.matrix(GK)#GK is pure genotype matrix
}else{
theGK=matrix(GK,n,1)
}
myFaSTREML=GAPIT.get.LL(pheno=matrix(Y[,-1],nrow(Y),1),geno=NULL,snp.pool=theGK,X0=theCV)
REMLs=-2*myFaSTREML$LL
delta=myFaSTREML$delta
vg=myFaSTREML$vg
ve=myFaSTREML$ve
#print("FarmCPU.Burger succeed!")
return (list(REMLs=REMLs,vg=vg,ve=ve,delta=delta))
} #end of FarmCPU.Burger
#=============================================================================================
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.