gwaR: Functions for performing GWA from GBLUP

# CODE: GWAS_meat_growth_traits Creation date:06/26/2015 Last modification:06/26/2015 Authors: DV SC JPS

#' efficiently calculates the diagonal of  \code{x*y*t(x)}.  
#' @title Efficient computation of diagonals of matrix products
#' @param x A matrix of size n by m
#' @param y A symetric matrix of size m by m
#' @return A vector of length n 
#' @export

diagprod <- function(x, y) {
    # by using this function the user will get the diagonal elements of X Y X' Check points If x and y are matrices
    if (!(is.matrix(x))) 
        stop("x must be a matrix")
    
    if (!(is.matrix(y))) 
        stop("y must be a matrix")
    
    # Rows in x are the same as columns in y
    if (ncol(x) != nrow(y)) 
        stop("Number of rows in x must be equal to the number of columns in y")
    
    # y is a square matrix
    if (nrow(y) != ncol(y)) 
        stop("y must be a square matrix")
    
    # ck the names if(colnames(x)!=rownames(y)) stop ('x and y must have the same order')
    
    diag_prod <- rowSums((x %*% y) * x)
    return(diag_prod)
}

#' A frequency table of number of significant tests at given thresholds
#' @title Significance summary for a \code{\link{gwas}} object
#' @param gwas an object of the class gwas generated by \code{\link{gwas}}
#' @param pvalue a vector with the p-value thresholds to build frequency table
#' @param correction a string with value 'bonf' if compute bonferroni correction is needed
#' @return a frequency table with the number of SNP significant at the thresholds specified by pvalue
#' @export

summary.gwas <- function(gwas, pvalue = c(0.001, 0.01, 0.025, 0.05, 0.1), correction = "bonf") {
    
    if (is.null(gwas)) 
        stop("missing GWAS object")
    if (all(class(gwas) != "gwas")) 
        stop("object must be of class gwas")
    if (min(pvalue) < 0 || max(pvalue) > 1) 
        stop("range of pvalue must be between 0 and 1")
    
    
    if (correction == "bonf") {
        # Get the value that will be the threshold for those p values
        threshold <- qnorm(-pvalue/(2 * nrow(gwas)) + 1)
    } else {
        threshold <- qnorm(-pvalue/2 + 1)
        warning("no multiple test correction")
    }
    
    
    names(threshold) <- sub(0, replacement = "<0", pvalue)
    # Crate Standarization of the values (standarized_values)
    standarized_values <- gwas[["ghat"]]/sqrt(gwas[["var_ghat"]])
    
    
    # Compare those Standarized effects with the threshold, check which ones are bigger
    signif_values <- t(as.matrix(rowSums(sapply(abs(standarized_values), ">=", threshold))))
    rownames(signif_values) <- "signif.values"
    
    # change the name in case we want to use a print function
    class(signif_values) <- "summary.gwas"
    
    return(signif_values)
}


#' A function to create q-qplots of pvalues from high dimensional test (e.g: GWA, Differential expression analyses).  
#' @title q-qplots of p-values 
#' @param pvector A vector of pvalues, assumed to follow a uniform distribution under the null hypothesis
#' @param add a logical value, T: adds to existing graphic or F: creates a new graphic
#' @param ... additional parameters to parse to plot function
#' @return NULL. A graphic will be generated
#' @export

qqgplot = function(pvector, add = F, ...) {
    o = -log10(sort(pvector, decreasing = F))
    e = -log10(1:length(o)/length(o))
    if (!add) {
        plot(e, o, ..., xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))), 
            xlim = c(0, max(e)), ylim = c(0, max(o) + 1))
    } else {
        points(e, o, ...)
    }
    lines(e, e, col = "red")
}

#' A function to create manhattan plots from GWA and QTL mapping analyses.  
#' @title Manhattan plots of p-values 
#' @param pvalues A vector of pvalues with SNP, marker or position names
#' @param map a genetic map: a data.frame with SNP or marker names in rows, chromosome name (chr column) and position (pos column)
#' @param threshold the threshold of significance 
#' @param col a vector of strings with color names for painting SNP according to their chromosome
#' @param chrom a vector of chromosomal names to be included in the graphic by default all chromosomes are included 
#' @param ... additional graphical parameters used by plot 
#' @return NULL. A graphic is be generated
#' @export

manhattan_plot<-function (pvalues, map, threshold = 0.01, col = c("black", "red"), 
                          chrom = NULL, ...) 
{
  if (!("chr" %in% colnames(map))) 
    stop("chr should be a column of map")
  if (sum(rownames(map) == names(pvalues)) < length(pvalues)) 
    stop("make sure that the values are ordered, rownames of map and names of p-value vector should agree")
  if ((max(pvalues) > 1) | (min(pvalues) < 0)) 
    stop("pvalues should be in the range [0,1]")
  if (sum(pvalues == 0) > 0) {
    pvalues[pvalues == 0] <- min(pvalues[pvalues > 0])
    warning("some p-values were exactly zero and they were replaced with the next smallest value")
  }
  if (!is.null(chrom)) {
    idx <- as.character(map$chr) %in% as.character(chrom)
    map <- map[idx, ]
    pvalues <- pvalues[idx]
  }
  if (length(col) < 2) 
    stop("two colors are needed in the color vector")
  lv<-as.character(unique(map$chr))
  cl<-col[(1:length(lv))%%2+1]
  names(cl)<-lv
  plot(-log(pvalues, 10), pch = 20, col = cl[as.character(map$chr)], 
       ylab = "-log10-pvalue", xlab = "chr", ..., axes = F)
  axis(2)
  lns <- (by(map, map$chr, nrow))
  lns<-lns[lv]
  axis(1, at = c(0, cumsum(lns)[-length(lns)]) + as.vector(lns/2), 
       labels = lv)
  box()
  abline(h = -log(threshold, 10), lwd = 2, col = "green")
}

#'  Computes pvalues from a GWA class object represents them graphically. using \code{\link{manhattan_plot}} and \code{\link{qqgplot}}
#' @title Significance plots and pvalues from \code{\link{gwas}} output
#' @param gwas an object of the class gwas generated by \code{\link{gwas}}
#' @param gpdata an object of class gpdata used to obtain a SNP map
#' @param correction a string with value 'bonf' if compute bonferroni correction is needed
#' @param plotlog10 a logical value indicating if a manhattan plot is requested
#' @param q.qplot a logical value indicating if a q-qplot is requested
#' @param pvalue significance threshold
#' @param ... 
#' @return a vector of pvalues with names equal to marker names. Optional output: Manhattan plot and q-qplot of pvalues
#' @export

plot.gwas <- function(gwas, correction = "bonf", gpdata = NULL, plotlog10 = FALSE, pvalue = 0.05, q.qplot = FALSE, chrom = NULL, 
    ...) {
    
    
    # get the significance level
    if (correction == "bonf") {
        # Get the value that will be the threshold for those p values
        threshold <- pvalue/nrow(gwas)
    } else {
        threshold <- pvalue
        warning("no multiple test correction")
    }
    map <- na.omit(gpdata$map)
    idx <- match(rownames(gwas), rownames(map))
    if (any(is.na(idx))) 
        warning("some markers in the gwa object are not present in the map")
    map <- map[idx, c("chr", "pos")]
    
    pval <- getpvalue(gwas, log.p = F)
    
    if (q.qplot == TRUE) {
        qqgplot(pvector = pval, main = NULL, add = F, ...)
    }
    
    if (plotlog10 == TRUE) {
        manhattan_plot(map = map, pvalues = pval, threshold = threshold, col = c("black", "red"), chrom = chrom, ...)
    }
    
}

#'  Pvalue for blup objects
#' @title pvalue computation for a \code{\link{gwas}} object
#' @param gwas an object of the class gwas generated by \code{\link{gwas}}
#' @param log.p a logical value to indicate if log10-pvalues are requested
#' @param is.z a logical value to indicate if z-scores are provided instead of estimate of snp effects and their variances
#' @return a vector of p-values or log10 p-values
#' @export

getpvalue <- function(gwas, log.p = T, is.z = F) {
    
    if (is.null(gwas)) 
        stop("missing GWAS object")
    if (is.z) {
        snpej <- gwas
    } else {
        if (all(class(gwas) != "gwas")) 
            stop("object must be of class gwas")
        # Get the standarized values of the SNP
        snpej <- gwas[["ghat"]]/sqrt(gwas[["var_ghat"]])
        names(snpej) <- rownames(gwas)
    }
    # Calculate the p value
    pval <- (2 - 3 * log.p) * pnorm(abs(snpej), lower.tail = F, log.p = log.p)/(log(10)^log.p) - log.p * log10(2)
    return(pval)
} 

#'  A function used internally to compute Confidence Intervals of GWA peaks
#' @title Not intended for direct use. See  \code{\link{IC_gwa}} for details
#' @param gbl a gblup object generated by \code{\link{gblup}}
#' @param Z a matrix of standardized genotypes
#' @param map a map dataset for the SNP in Z
#' @return the position of two peaks
#' @export

IC_base<-function(gbl,Z=NULL,map=NULL){
  fenames<-all.vars(gbl$model$formula)
  wt=NULL
  rnames<-all.vars(gbl$model$Vformula)
  if("wt"%in%rnames){
    wt<-diag(gbl$model$wt)
    names(wt)<-colnames(gbl$model$G)
  }
  rnames<-rnames[!rnames%in%c("G","wt","In")]
  if(length(all.vars(update(gbl$model$Vformula,.~.-G-In-wt)))==1){
    design=gbl$model$formula
  } else{
    design=c(gbl$model$formula,update(gbl$model$Vformula,.~.-G-In-wt))
  }
  
  rmname<-rnames[unlist(sapply(gbl$model[rnames],is.matrix))]
  rvname<-rnames[unlist(!sapply(gbl$model[rnames],is.matrix))]
  
  dts<-as.data.frame(gbl$model[names(gbl$model)%in%c(fenames,rvname)])
  
  vdata<-gbl$model[names(gbl$model)%in%rmname]
  
  

  lns<-nrow(dts)
  id<-sample(x = c(T,F),lns,replace = T)
  dt1<-dts[id,,drop=F]
  dt2<-dts[!id,,drop=F]
  nrw1<-gblup("y",data=dt1,design=design,G=gbl$model$G,vdata=vdata,wt=wt,pos=gbl$pos,start=gbl$sigma)
  nrw2<-gblup("y",data=dt2,design=design,G=gbl$model$G,vdata=vdata,wt=wt,pos=gbl$pos,start=gbl$sigma)
  
  t1<-gwas(nrw1,t(Z))
  p1<-which.max(abs(t1[,1]/sqrt(t1[,2])))
  t2<-gwas(nrw2,t(Z))
  p2<-which.max(abs(t2[,1]/sqrt(t2[,2])))
  
  d<-sort(c(map$pos[p1],map$pos[p2]))
  return(d)
}


#' A function usedto compute Confidence Intervals of GWA peaks
#' @title Computes confidence interval of GWA peak 
#' @param gbl a gblup object generated by \code{\link{gblup}}
#' @param Z a matrix of standardized genotypes
#' @param map a map dataset for the SNP in Z
#' @param rng a range of positions for which to compute the CI: a vector with three elements: chr, start, end 
#' @param peak the position of the GWA peak should be within the range 
#' @param alpha the confidence level for the confidence interval
#' @param iter the number of iterations 
#' @param NP a logical value to compute non-parametric estimate  
#' @return a data.frame with 3 rows: first row is corresponds to the lower limit, the central row corresponds to the peak and the 3rd row is the lower limit. the columns correspond to: a)  name of closest snp, b) SNP chromosome, c) SNP position, D) exact position
#' @export

IC_gwa <- function(gbl,Z=NULL,map=NULL,rng=NULL,peak=0,alpha=0.99,iter=500,NP=F){
  if (!is.null(rng)){
    if (sum(rng[1]%in%map$chr)==0){
      stop("chromosome in range is not in map")
    }
    map<-map[map$chr==rng[1],]
    map<-map[(map$pos>=rng[2])&(map$pos<=rng[3]),]
    if (nrow(map)<2){
      stop("too few rows in your map, please adjust range")
    }
    if (sum(!rownames(map)%in%colnames(Z))>0){
      stop("some SNP in the map not present in Z matrix")
    }
    Z<-Z[,rownames(map)]
    if(class(gbl)!="gblup"){
      stop("gbl should be a gblup object")
    }
  }
  
  if((peak<map$pos[1])|(peak>map$pos[length(map$pos)])){
    stop("peak should be within the range of snp in the map")
  }
  
  vs<-replicate(iter,IC_base(gbl,Z,map))
  if (!NP){
    vs<-vs[2,]-vs[1,]
    se<-sqrt(sum(vs^2)/(4*length(vs)))
    ft<-qnorm(1-(1-alpha)/2)
    IC<-c(peak-ft*se,peak+ft*se)} else{
      i<- seq(0,1-alpha,0.0001)
      ICt<-sapply(i,function(x) c(quantile(vs[1,],x),quantile(vs[2,],0.95-x)))
      lng<-ICt[2,]-ICt[1,]
      IC<-ICt[,which.min(lng)]
    }
  
  closestsnppeak<-which.min(abs(map$pos-peak))
  if((IC[1]<rng[2])|(IC[2]>rng[3])){
    cat("WARNING: confidence interval boundaries lie outside of the scanned region. It is recommended in increase the range and recompute IC\n")
  }
  closestsnpll<-which.min(abs(map$pos-IC[1]))
  closestsnpul<-which.min(abs(map$pos-IC[2]))
  selmap<-map[c(closestsnpll,closestsnppeak,closestsnpul),]
  selmap$exact<-c(IC[1],peak,IC[2])
  return(selmap)
}

#' A function to find secondary peaks in a GWA
#' @title a function to confirm GWA peaks and to detect secondary peaks within chromosomes  
#' @param gbl a gblup object generated by \code{\link{gblup}}
#' @param gwa a gblup object generated by \code{\link{gwa}}
#' @param Z a matrix of standardized genotypes
#' @param map a map dataset for the SNP in Z
#' @param threshold a significance threshold for GWA 
#' @param criteria the p-value correction to use 
#' @return list with three elements 
#' \itemize{ 
#'   \item {\code{gblup}} {the new gblup object with peak SNP as fixed effects} 
#'   \item{\code{gwas}} {a new gwas objectest to look for peaks}
#'   \item{\code{peaks}} {a map with the positions of the included peaks}
#'}
#' @export

find_peaks <- function(gbl,gwa,Z,map,threshold=0.95,criteria=c("qvalue","bonferroni","uncorrected")){
  criteria<-criteria[1]
  if (!criteria%in%c("qvalue","bonferroni","uncorrected")) {
    stop("the criteria should be qvalue, bonferroni or uncorrected")
  } 
  if (class(gbl)!="gblup"){
    stop("gbl should be of class gblup")
  }
  if(sum((rownames(gwa)!=colnames(Z))|((rownames(gwa)!=rownames(map))))>0){
    stop("map, gwa object and Z should have matching row/columns")
  }
  
  pv<-getpvalue(gwa,log.p = F)
  if(criteria=="qvalue"){
    pv<-qvalue(pv)$qvalue
  }
  if(criteria=="bonferroni"){
    pv<-1-(1-pv)^length(pv)
  }
  
  ns<-sum(pv<threshold)
  if(ns==0){
    stop("no SNP reach the significance threshold")
  }
  
  minv<-tapply(pv,map$chr,function(x) names(x)[which.min(x)])
  mins<-as.vector(by(pv,map$chr,min))
  
  bsf<-update(gbl$model$formula,as.formula(paste(".~.+",paste(minv[mins<threshold],sep="",collapse="+"))))
  
  fenames<-all.vars(gbl$model$formula)
  wt=NULL
  rnames<-all.vars(gbl$model$Vformula)
  if("wt"%in%rnames){
    wt<-diag(gbl$model$wt)
    names(wt)<-colnames(gbl$model$G)
  }
  rnames<-rnames[!rnames%in%c("G","wt","In")]
  if(length(all.vars(update(gbl$model$Vformula,.~.-G-In-wt)))==1){
    design=bsf
  } else{
    design=c(bsf,update(gbl$model$Vformula,.~.-G-In-wt))
  }
  
  rmname<-rnames[unlist(sapply(gbl$model[rnames],is.matrix))]
  rvname<-rnames[unlist(!sapply(gbl$model[rnames],is.matrix))]
  
  dts<-as.data.frame(gbl$model[names(gbl$model)%in%c(fenames,rvname)])
  tobind<-Z[rownames(dts),minv[mins<threshold],drop=F]
  colnames(tobind)<-minv[mins<threshold]
  dts<-cbind(dts,tobind)
  
  
  
  vdata<-gbl$model[names(gbl$model)%in%rmname]
  
  nrw1<-gblup("y",data=dts,design=design,G=gbl$model$G,vdata=vdata,wt=wt,pos=gbl$pos,start=gbl$sigma)
  t1<-gwas(nrw1,t(Z))
  return(list(gblup=nrw1,gwas=t1,peaks=map[minv[mins<threshold],]))
}