R/utility_functions.R

#' A print method for ba objects
#' @param x an object of the class \code{ba} generated by bafit
#' @return Null. This function only prints to the screen.
#' @examples \dontrun{
#'  rm(list=ls())
#'  library(BATools)
#'  data("Pig")
#'  #Standardize genotype matrix
#   geno=std_geno(PigM,method="s",freq=PigAlleleFreq)
#   init=set_init("driploss",data=PigPheno,geno=geno,"id",df=5,pi_snp=0.001,h2=0.5,c=1000,model="SSVS",centered=TRUE)
#'  #or set your own starting values using 
#'  #init=list(df=5,scale=0.01,pi=1) 
#'  run_para=list(niter=2000,burnIn=1000,skip=10)
#'  print_mcmc=list(piter=500)
#'  update_para=list(df=FALSE,scale=TRUE,pi=F)
#'  op<-create.options(model="SSVS",method="MCMC",seed=1,priors=NULL,init=init,
#'                   update_para=update_para,run_para=run_para,save.at="SSVS",print_mcmc=print_mcmc)
#'  SSVS<-baFit(driploss~sex,data=PigPheno,geno=geno ,genoid = ~id,options = op,map=PigMap,GWA="Win")
#'  SSVS
#' }
#' @export
print.ba <- function(x,...) {    
    cat("Result of BATools: \n")    
    cat("\nestimated fixed effects:\n")
    print(x$bhat)
    if(!is.null(x$sdb)){
      cat("\nSD\n")
      print(x$sdb)
    }
    cat("\nestimated hyperparameters:\n")
    print(x$hyper_est)
    if(!is.null(x$eff_sample)) {
	    cat("\neffective sample size for hyperparameters/variance components: \n")
	    print(x$eff_sample)
    }
    if(!is.null(x$train)){
      cat("\ncross-validation accuarcy: \n")
      cat(round(cor(x$y[!x$train],x$yhat[!x$train]),3))
    }
}

#'  Set initial values for hyperparameters
#' @param y character phenotypes name 
#' @param data data.frame containing phenotypes
#' @param geno genotype matrix
#' @param genoid character of the column name of `data` representing the id related the rowname of `geno`
#' @param df degrees of freedom parameter
#' @param scale scale parameter, if is `NULL` set automatically 
#' @param vare residual variance, if is `NULL` set automatically 
#' @param g starting values for SNP effects, if is `NULL` will start from 
#' @param beta starting values for fixed effects, 
#' @param pi_snp proportion of SNP that have non-zero effect for BayesB and SSVS, default is 0.05 
#' @param h2 hertiability
#' @param c ratio for the smaller variance for SSVS
#' @param model the model used for analysis, can be c("rrBLUP","GBLUP", "BayesA","BayesB", "SSVS","ssGBLUP", "ssBayesA","ssBayesB", "ssSSVS","anteBayesA","anteBayesB")
#' @param centered logical indicating if Z is centered
#' @param from the source of scale, if it's 'rrBLUP', the scale will be adjusted for BayesA and SSVS
#' @return a list of initial values
#' @examples \dontrun{
#'  data(Pig)
#'  init=set_init(driploss~1,data=PigPheno,geno=geno,genoid = ~id,model="SSVS",centered=TRUE)
#' }
#' @export
set.init <- function(formula,data,geno,genoid,df=5,scale=NULL,vare=NULL,g=NULL,
                     beta=NULL,pi_snp=0.05,post_prob=NULL,h2=0.5,c=1000,
                     model=c("GBLUP","rrBLUP", "BayesA","BayesB", "SSVS","ssGBLUP", "ssBayesA","ssBayesB", "ssSSVS","anteBayesA","anteBayesB"),
                     varGenetic=NULL,centered=T,from=NULL) {    
  #if(!is.character(y)) stop("Phenotype name has to be an character")
  model<-match.arg(model)
  
  #id<-model.frame(as.formula(paste0("~",genoid)),data=data,na.action = na.pass)
  id<-model.frame(genoid,data=data,na.action = na.pass)
  id <- eval(id, parent.frame())
  id <-as.character(t(id))
  
  if(substr(model,1,2)=="ss"){
    idgeno<-id[which(id %in% rownames(geno))]
    Z<-geno[idgeno,]  
  }else(
    Z<-geno[id,]
  )
  
  
  data<- data %>% filter (id %in% rownames(Z))
  #mf <- model.frame(as.formula(paste0(y,"~0")), data = data, na.action = na.pass)
  mf <- model.frame(formula, data = data, na.action = na.pass)
  mf <- eval(mf, parent.frame())
  y <- as.numeric(t(mf))
  
  if(is.null(scale)){
    if(centered){
      z2=apply(Z,2,function(x) sum(x^2))
      sumMeanSq= sum((apply(Z,2,mean))^2)
      MSz=sum(z2)/dim(Z)[1]-sumMeanSq
    }else{
      zc=std_geno(Z,method="c")
      z2=apply(zc,2,function(x) sum(x^2))
      sumMeanSq= sum((apply(zc,2,mean))^2)
      MSz=sum(z2)/dim(zc)[1]-sumMeanSq
    }
    tmpSigmaG=rrBLUP=h2*var(y)/MSz
    scale=switch(model,rrBLUP=h2*var(y)/MSz,GBLUP=h2*var(y)/MSz,BayesA=(df-2)*h2*var(y)/df/MSz,
                 SSVS=c*h2*var(y)/MSz/(c+(1-pi_snp)*(1-c)),BayesB=(df-2)*h2*var(y)/df/MSz/pi_snp,
                 ssBayesA=(df-2)*h2*var(y)/df/MSz,anteBayesA=(df-2)*h2*var(y)/df/MSz,
                 ssSSVS=c*h2*var(y)/MSz/(c+(1-pi_snp)*(1-c)),ssBayesB=(df-2)*h2*var(y)/df/MSz/pi_snp,
                 anteBayesB=(df-2)*h2*var(y)/df/MSz/pi_snp)
  }else{
    if(!is.null(from)){
      if(from %in% c("rrBLUP","GBLUP")){
        tmpSigmaG=scale
        if(model %in% c("BayesA","ssBayesA")) scale=(df-2)/df*scale
        if(model %in% c("BayesB","ssBayesB")) scale=(df-2)/df*scale/pi_snp
        if(model %in% c("SSVS","ssSSVS")) scale=c/(c+(1-pi_snp)*(1-c))*scale
      }
    }
  }

  if(is.null(vare)){
    vare= var(y)*(1-h2) #*(df+2) df=-1 #crossprod(y)/length(y)
  }
  
	if(!(model%in%c("BayesB","ssBayesB","anteBayesB","SSVS","ssSSVS"))) pi_snp=1
	
  if(substr(model,1,2)=="ss"){
    if(is.null(varGenetic)) varGenetic=tmpSigmaG
    init<-list(vare=vare,df=df,scale=scale,g=g,beta=beta,pi=pi_snp,varGenetic=varGenetic)
    if(model %in% c("SSVS","ssSSVS")) init<-list(vare=vare,df=df,scale=scale,g=g,beta=beta,pi=pi_snp,post_prob=post_prob,c=c,varGenetic=varGenetic)
  }else{
    init<-list(vare=vare,df=df,scale=scale,g=g,beta=beta,pi=pi_snp)
    if(model %in% c("SSVS","ssSSVS")) init<-list(vare=vare,df=df,scale=scale,g=g,beta=beta,pi=pi_snp,post_prob=post_prob,c=c)
    
  }	
  init
}




#' 
#' Re-center genotype matrix
#' @title Re-center genotype matrix
#' @param x genotype matrix or `baData` object
#' @param method the method used for centering, `c` is centering and `s` is standardizing 
#' @param freq allele frequency of the one coded as 1, if not supplied, will use the frequency in \code{x}
#' @return centered genotype matrix  or `baData` object with centered genotype matrix
#' @examples \dontrun{
#'  data("Pig")
#'  #Standardize genotype matrix with the allele frequency in F0
#'  geno=std_geno(PigM,method="s",freq=PigAlleleFreq)
#' }
#' @export
std_geno <- function(x,method=c("s","c"),freq=NULL) {  
    Z=x
    method<-match.arg(method)
    if(method=="c"){
    	Zc<-apply(Z,2,function(x) (x-mean(x)))	
    }else if(method=="s"){
		  if(is.null(freq)) p<-apply(Z,2,mean)/2 else p<-freq
		  tmp<-sweep(Z,2L,2*p,FUN="-")
		  #d=sapply(sqrt(2*p*(1-p)),function(x) if(x==0) x=1 else x=x,simplify=T) ###added code to check if p==0
		  d=sqrt(sum(2*p*(1-p)))
		  Zc<-tmp/d #sweep(tmp,2L,d,FUN="/")
	}else{stop("method should be either 'c' or 's'")}
	Zc
}


#' Quickly create cross-validation folds
#' @param data dataframe contain the phenotypes
#' @param k number of folds
#' @param formula formula of the trait name
#' @return data.frame of the all traits including the cross-validation folds
#' @examples \dontrun{
#' set.seed(1234)
#' PigPheno=createCV(data = PigPheno,k=5,"driploss")
#' }
#' @export
createCV<-function(formula,data,k=5){
  cln<-colnames(data)
  coli=which(substr(cln,1,2)!="cv")
  data<-data[,coli]

  mf <- model.frame(formula , data = data, na.action = na.pass)
  mf <- eval(mf, parent.frame())
  y <- as.numeric(t(mf))
  
  cvs<-createFolds(y,k=k,list=F)
  cln<-colnames(data)
  newdata<-c()
  for(i in 1:k){
    newdata<-cbind(newdata,cvs!=i)
  }
  colnames(newdata)=c(paste0("cv",1:k))
  cbind(data,newdata)
}

###createFolds from caret R package
createFolds<-function (y, k = 10, list = TRUE, returnTrain = FALSE) 
{
  if (class(y)[1] == "Surv") 
    y <- y[, "time"]
  if (is.numeric(y)) {
    cuts <- floor(length(y)/k)
    if (cuts < 2) 
      cuts <- 2
    if (cuts > 5) 
      cuts <- 5
    breaks <- unique(quantile(y, probs = seq(0, 1, length = cuts)))
    y <- cut(y, breaks, include.lowest = TRUE)
  }
  if (k < length(y)) {
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))
    for (i in 1:length(numInClass)) {
      min_reps <- numInClass[i]%/%k
      if (min_reps > 0) {
        spares <- numInClass[i]%%k
        seqVector <- rep(1:k, min_reps)
        if (spares > 0) 
          seqVector <- c(seqVector, sample(1:k, spares))
        foldVector[which(y == names(numInClass)[i])] <- sample(seqVector)
      }
      else {
        foldVector[which(y == names(numInClass)[i])] <- sample(1:k, 
                                                               size = numInClass[i])
      }
    }
  }
  else foldVector <- seq(along = y)
  if (list) {
    out <- split(seq(along = y), foldVector)
    names(out) <- paste("Fold", gsub(" ", "0", format(seq(along = out))), 
                        sep = "")
    if (returnTrain) 
      out <- lapply(out, function(data, y) y[-data], y = seq(along = y))
  }
  else out <- foldVector
  out
}

#' Create Manhattan plot for posterior probability based models
#' @param ba an object of the class \code{ba} generated by baFit
#' @param type can be c("SNP","Win") for single SNP or window based Manhattan plot 
#' @param col color for the Manhattan plot 
#' @param ... other plot options 
#' @return Null. This function only prints to the screen.
#' @examples \dontrun{
#'  rm(list=ls())
#'  library(BATools)
#'  data("Pig")
#'  #Standardize genotype matrix
#   geno=std_geno(PigM,method="s",freq=PigAlleleFreq)
#   init=set_init("driploss",data=PigPheno,geno=geno,"id",df=5,pi_snp=0.001,h2=0.5,c=1000,model="SSVS",centered=TRUE)
#'  #or set your own starting values using 
#'  #init=list(df=5,scale=0.01,pi=1) 
#'  run_para=list(niter=2000,burnIn=1000,skip=10)
#'  print_mcmc=list(piter=500)
#'  update_para=list(df=FALSE,scale=TRUE,pi=F)
#'  op<-create.options(model="SSVS",method="MCMC",seed=1,priors=NULL,init=init,
#'                   update_para=update_para,run_para=run_para,save.at="SSVS",print_mcmc=print_mcmc)
#'  SSVS<-baFit(driploss~sex,data=PigPheno,geno=geno ,genoid = ~id,options = op,map=PigMap,GWA="Win")
#'  par(mfrow=c(1,2))
#'  man_plot_prob(SSVS)
#'  man_plot_prob(SSVS,type="Win")
#' }
#' @export
man_plot_prob<-function(ba,type=c("SNP","Win"), col = c("black", "red"),...){
  type<-match.arg(type)
  if(ba$model %in% c("rrBLUP","BayesA","anteBayesA","ssBayesA") && type=="SNP"){
    ba$pvalue=ba$bpvalue
    man_plot_pvalue(ba,...)
  }else{
    if(ba$GWA!="Win" && type=="Win") stop("Window based GWA is require to create plot for window based approach, redo analysis with GWA=Win")
    if(type=="SNP"){
      p<-ba$prob
      Chromsome_id=ba$map$chr
    }
    if(type=="Win"){
      p=ba$Wprob
      Chromsome_id=ba$Wchr
    }
    plot(p, pch = 20, col = col[(Chromsome_id%%2) + 1], ylab = "Posterior probability", xlab = "Chromsome",  axes = F,...)
    axis(2)
    lns <- (by(Chromsome_id,Chromsome_id , length))
    axis(1, at = c(0, cumsum(lns)[-length(lns)]) + as.vector(lns/2), labels = names(lns))
    box()
  }
  #abline(h = 0.9, lwd = 2, col = "green")
}


#' @export
man_plot_assoc<-function(ba, col = c("black", "red"),...){
  #type<-match.arg(type)
  if(ba$model %in% c("anteBayesA","anteBayesB")){
     map=ba$map
     if(max(map$chr)>1){
      #Tmpmap=map[which(rownames(map)%in%colnames(geno)),]
      ante_p=rep(0,max(map$chr))
      ante_p[1]=sum(map$chr==1)
      for(i in 2:length(ante_p))
      {
        ante_p[i]=sum(map$chr==i)+ante_p[i-1]
      }
    }else{
      ante_p=NULL
    }
      antet<-ba$ante_t[-ante_p]
      mapt<-ba$map[names(antet),]
      Chromsome_id=mapt$chr
   
    plot(abs(antet), pch = 20, col = col[(Chromsome_id%%2) + 1], ylab = "Abs(t)", xlab = "Chromsome",  axes = F,...)
    axis(2)
    lns <- (by(Chromsome_id,Chromsome_id , length))
    axis(1, at = c(0, cumsum(lns)[-length(lns)]) + as.vector(lns/2), labels = names(lns))
    box()
  }else{
    
    stop("This model is not antedependence model")
   
  }
  
}

#' Create Manhattan plot for pvalue based models
#' @param ba an object of the class \code{ba} generated by baFit
#' @param type can be c("SNP","Win") for single SNP or window based Manhattan plot 
#' @param col color for the Manhattan plot 
#' @param ylim the range of yaxis
#' @param ... other plot options 
#' @return Null. This function only prints to the screen.
#' @examples \dontrun{
#'  rm(list=ls())
#'  library(BATools)
#'  data("Pig")
#'  #Standardize genotype matrix
#   geno=std_geno(PigM,method="s",freq=PigAlleleFreq)
#   init=set_init("driploss",data=PigPheno,geno=geno,"id",df=5,pi_snp=1,h2=0.5,c=NULL,model="GBLUP",centered=TRUE)
#'  run_para=list(maxiter=100)
#'  update_para=list(df=FALSE,scale=TRUE,pi=FALSE)
#'  op<-create.options(model="GBLUP",method="REML",priors=NULL,init=init,
#'                   update_para=update_para,run_para=run_para,save.at="GBLUP",print_mcmc=NULL)
#'  gblup<-baFit(driploss~sex,data=PigPheno,geno=geno ,genoid = ~id,options = op,map=PigMap,GWA="SNP")
#'  par(mfrow=c(1,2))
#'  man_plot_pvalue(gblup)
#'  man_plot_pvalue(gblup,type="Win")
#' }
#' @export
man_plot_pvalue<-function(ba,type=c("SNP","Win"), col = c("black", "red"),ylim=c(0,12),...){
  type<-match.arg(type)
  if(ba$GWA!="Win" && type=="Win") stop("Window based GWA is require to create plot for window based approach, redo analysis with GWA=Win")
  if(type=="SNP"){
    p<-ba$pvalue
    Chromsome_id=ba$map$chr
  }
  if(type=="Win"){
    p=ba$Wpvalue
    Chromsome_id=ba$Wchr
  }
  plot(-log(p,10), pch = 20, col = col[(Chromsome_id%%2) + 1], ylab = "-log10-pvalue", xlab = "Chromsome",  axes = F,ylim = ylim,...)
  axis(2)
  lns <- (by(Chromsome_id,Chromsome_id , length))
  axis(1, at = c(0, cumsum(lns)[-length(lns)]) + as.vector(lns/2), labels = names(lns))
  box()
  threshold=ifelse(ba$model %in%c("GBLUP","rrBLUP","ssGBLUP") ,0.05/length(p),0.05)
  abline(h = -log(threshold, 10), lwd = 2, col = "green")
}


get_full_rank_X<-function(X,y){
  
  n <- length(y)
  Xcolnames <- dimnames(X)[[2]]
  Xrownames <-rownames(X)
  if(is.null(Xcolnames)) {
    Xcolnames <- paste("X.column",c(1:dim(as.matrix(X))[2]),sep="")
  }
  
  X <- matrix(X, n, length(X)/n)
  XtX=crossprod(X)
  qr <- qr(XtX)
  rankQ <- n-qr$rank
  if(qr$rank) {
    X <- matrix(X[, qr$pivot[1:qr$rank]],n,qr$rank)
    Xcolnames <- Xcolnames[qr$pivot[1:qr$rank]]
  } else {
    cat("\nERROR: X has rank 0\n\n")
  }
  colnames(X)=Xcolnames
  rownames(X)=Xrownames
  return(X)
}
chenchunyu88/BATools documentation built on May 19, 2019, 8:21 a.m.