R/associationTest.R

Defines functions plotSelFreq plotQtc lmQtc medoQtc qtcatQtc qtcatHit qtcatPheno qtcatGeno

Documented in lmQtc medoQtc plotQtc plotSelFreq qtcatGeno qtcatHit qtcatPheno qtcatQtc

#' @title A genotype object constructor
#'
#' @description Constructs a S3-object containing a SNP design matrix and a hierarchy. If
#' a SNP in the input object contains missing data, the clustering is used to impute
#' information from highly correlated neighbor SNPs. The genotype object is needed for
#' \code{\link{qtcatHit}} as input.
#'
#' @param snp an object of S4 class \linkS4class{snpMatrix}.
#' @param snpClust an object of class \code{\link{qtcatClust}}.
#' @param absCor a vector of absolute value of correlations considered in the hierarchy.
#' @param min.absCor a minimum absolute value of correlation which is considered. A value
#' in the range from 0 to 1.
#' @param mc.cores a number of cores for parallelising. The maximum is \code{'B'}. For
#' details see \code{\link[parallel]{mclapply}}.
#'
#' @examples
#' # file containing example data for SNP data
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#'
#' # Construct geotype object
#' geno <- qtcatGeno(snp, clust)
#'
#' @importFrom hit as.hierarchy
#' @importFrom methods is
#' @importFrom stats reorder
#' @export
qtcatGeno <- function(snp, snpClust, absCor, min.absCor = 0.5, mc.cores = 1) {
  stopifnot(is(snp, "snpMatrix"))
  stopifnot(is(snpClust, "qtcatClust"))
  if (!setequal(names(snpClust$clusters), colnames(snp)))
    stop("Names of 'snp' and 'snpClust' differ")
  if (missing(absCor))
    hier <- as.hierarchy(snpClust$dendrogram, 1 - min.absCor, names = colnames(snp))
  else
    hier <- as.hierarchy(snpClust$dendrogram, height = 1 - absCor, names = colnames(snp))
  if (any(naFreq(snp) > 0))
    snp <- imputeMedoids(snp, snpClust, hier, .25, mc.cores)
  if (is.null(names <- snpClust$medoids))
    names <- names(snpClust)
  snpnames <- colnames(snp)[colnames(snp) %in% names]
  desMat <- as.matrix(snp[, snpnames])
  hier <- reorder(hier, snpnames)
  out <- list(x = desMat,
              hierarchy = hier,
              clusters = snpClust$clusters,
              medoids = snpClust$medoids,
              snpInfo = snpInfo(snp))
  class(out) <- "qtcatGeno"
  out
}

#' @title A phenotype object constructor
#'
#' @description Constructs an S3-object containing phenotype and if additional covariats
#' exist a design matrix of those. The phenotype object is needed as input for
#' \code{\link{qtcatHit}}.
#'
#' @param names a vector of individual names of length 'n'.
#' @param pheno a vector of length 'n' or a matrix size 'n x 2' in case of binomial family.
#' This contains the phenotypic observations.
#' @param family a character string specifying the family of the phenotype distribution.
#' Either "gaussian" (default) or "binomial".
#' @param covariates a matrix typically generated by a call of
#' \code{\link[stats]{model.matrix}}. It contain additional variables influencing the
#' phenotype e.g. environmental and experimental covariates.
#'
#' @examples
#' # file containing example data for a phenotype.
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#'
#' # Construct phenotype object
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#'
#' @export
qtcatPheno <- function(names, pheno, family = "gaussian", covariates = NULL) {
  match.arg(family, c("gaussian", "binomial"))
  nn <- length(names)
  if (any(is.na(pheno)))
    stop("Missing values in 'pheno' are not allowed")
  if (!is.null(covariates)) {
    if (is.matrix(covariates)) {
      nc <- nrow(covariates)
      if (all(covariates[, 1] == 1))
        covariates <- covariates[, -1, drop = FALSE]
    } else {
      stop("'covariates' has to be a 'matrix'")
    }
  } else {
    covariates <- matrix(nrow = nn, ncol = 0L)
    nc <- nn
  }
  if (is.vector(pheno)) {
    np <- length(pheno)
  } else if (is.matrix(pheno) && family == "binomial") {
    if (ncol(pheno) > 2)
      stop("'phone' has more than two columns which is not allowed")
    np <- nrow(pheno)
  } else {
    stop("'pheno' has to be a 'vector' or in in case of binomial family a 'matrix'")
  }
  if (nn == nc && nn == np)
    out <- list(names = as.character(names),
                pheno = pheno,
                covariates = covariates,
                family = family)
  class(out) <- "qtcatPheno"
  out
}


#' @title Fitting Hierarchical Inference Testing
#'
#' @description Hierarchical inference testing for phenotype-SNP association.
#'
#' @param pheno an object of class \code{\link{qtcatPheno}}.
#' @param geno an object of class \code{\link{qtcatGeno}}.
#' @param B a integer indicating the number of sample-splits.
#' @param p.samp1 a value specifying the fraction of data used for the LASSO sample-split.
#' The ANOVA sample-split is \code{1 - p.samp1}.
#' @param nfolds Number of folds (default is 5). See \code{\link[glmnet]{cv.glmnet}} for
#' more details.
#' @param overall.lambda 	Logical, if true, lambda is estimated once, if false (default),
#' lambda is estimated for each sample split.
#' @param lambda.opt a criterion for optimum selection of cross validated lasso. Either
#' "lambda.1se" (default) or "lambda.min". See
#' \code{\link[glmnet]{cv.glmnet}} for more details.
#' @param alpha a single value in the range of 0 to 1 for the elastic net mixing parameter.
#' @param gamma a vector of gamma-values used in significance estimation.
#' @param max.p.esti a maximum for computed p-values. All p-values above this value are set
#' to one. Small \code{max.p.esti} values reduce computing time.
#' @param seed a RNG seed, see \code{\link{set.seed}}.
#' @param mc.cores a number of cores for parallelising. The maximum is
#' \code{'B'}. For details see \code{\link[parallel]{mclapply}}.
#' @param trace logical, if \code{TRUE} it prints the current status of the program.
#' @param ... additional arguments for \code{\link[glmnet]{cv.glmnet}}.
#'
#' @examples
#' # If you want to run the examples, use:
#' # example(qtcatHit, run.dontrun = TRUE)
#' \dontrun{
#' # files containing example data for SNP data and the phenotype
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#' geno <- qtcatGeno(snp, clust)
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#'
#' # fitting HIT
#' fitted <- qtcatHit(pheno, geno)
#' }
#'
#' @importFrom methods is
#' @export
qtcatHit <- function(pheno, geno, B = 50, p.samp1 = 0.35,
                     nfolds = 5, overall.lambda = FALSE, lambda.opt = "lambda.1se",
                     alpha = 1, gamma = seq(0.05, 0.99, by = 0.01),
                     max.p.esti = 1, seed = 12321, mc.cores = 1, trace = FALSE, ...) {
  set.seed(seed)
  on.exit(set.seed(NULL))
  stopifnot(is(pheno, "qtcatPheno"))
  stopifnot(is(geno, "qtcatGeno"))
  id <- intersect(pheno$names, rownames(geno$x))
  if (!length(id))
    stop("The ID intersect of 'pheno' and 'geno' is emty")
  if (length(id.uniqueGeno <- setdiff(rownames(geno$x), id)))
    cat("The following individuals are part of 'geno' but not of 'pheno':\n",
        paste(id.uniqueGeno, collapse = " "), "\n")
  if (length(id.uniquePheno <- setdiff(pheno$names, id)))
    cat("The following individuals are part of 'pheno' but not of 'geno':\n",
        paste(id.uniquePheno, collapse = " "), "\n")
  phenoInx <- which(pheno$names %in% id)
  genoInx <- match(pheno$names[phenoInx], rownames(geno$x))
  if (ncol(pheno$covariates) == 0L)
    x <- geno$x[genoInx, ]
  else
    x <- cbind(geno$x[genoInx, ], pheno$covariates[phenoInx, ])
  y <- pheno$pheno[phenoInx]
  fitHit <- hit(x, y, geno$hierarchy, pheno$family,
                B, p.samp1, nfolds, overall.lambda, lambda.opt, alpha,
                gamma, max.p.esti, mc.cores,
                trace, standardize = FALSE)
  out <- c(fitHit, geno[3L:5L])
  class(out) <- c("qtcatHit", "hit")
  out
}


#' @title Summarize results of Hierarchical Inference Test
#'
#' @description Summarizing the QTCs (significant cluster of SNPs) and their position at
#' the genome.
#'
#' @param object an object of class \code{\link{qtcatHit}}.
#' @param alpha an alpha level for significance estimation.
#' @param min.absCor a minimum absolute value of correlation to be considered.
#'
#' @examples
#' # If you want to run the examples, use:
#' # example(qtcatQtc, run.dontrun = TRUE)
#' \dontrun{
#' # files containing example data for SNP data and the phenotype
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#' geno <- qtcatGeno(snp, clust)
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#' fitted <- qtcatHit(pheno, geno)
#'
#' # Summarizing the QTCs (loci37, loci260, and loci367 are causal)
#' qtcatQtc(fitted)
#' }
#'
#' @importFrom hit hit
#' @importFrom methods is
#' @export
qtcatQtc <- function(object, alpha = 0.05, min.absCor = 0.05) {
  stopifnot(is(object, "qtcatHit"))
  y <- summary(object, alpha, 1 - min.absCor)
  signames <- rownames(y)
  sigclust <- match(signames, object$medoids)
  sigClust <- matrix(0, length(object$clusters), 3L)
  sigClust[, 2L] <- 1
  for (i in seq_along(signames)) {
    inx <- which(object$clusters == sigclust[i])
    sigClust[inx, ] <- matrix(unlist(y[signames[i], ]),
                              length(inx), 3L, byrow = TRUE)
  }
  rownames(sigClust) <- names(object$clusters)
  sigClust[, 2L] <- 1 - sigClust[, 2L]
  colnames(sigClust) <- c(colnames(y)[1L], "absCor", colnames(y)[3L])
  sigClust <- as.data.frame(sigClust[sigClust[, 1L] != 0, ,drop = FALSE])
  out <- cbind(snpInfo(object)[rownames(sigClust), 1:2], sigClust)
  out
}


setOldClass("qtcatHit")
#' @title Get position from qtcatHit object
#'
#' @description Genetic position info from an object of class qtcatHit.
#'
#' @param object an object of class \code{\link{qtcatHit}}.
#'
#'
#' @importFrom methods setMethod setOldClass
#' @export
setMethod("snpInfo", "qtcatHit",
          function(object) {
            out <- object$snpInfo
            if (is.null(out)) {
              cat("No position information available")
            }
            out
          }
)


#' @title Find medoids of QTCs
#' @description Find a medoid of for each quantitative trait cluster (QTC).
#'
#' @param object an object of class \code{\link{qtcatHit}}.
#' @param geno an object of class \code{\link{qtcatGeno}}.
#' @param alpha an alpha level for significance estimation.
#' @param min.absCor minimum absolute value of correlation considered.
#'
#' @examples
#' # If you want to run the examples, use:
#' # example(medoQtc, run.dontrun = TRUE)
#' \dontrun{
#' # files containing example data for SNP data and the phenotype
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#' geno <- qtcatGeno(snp, clust)
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#' fitted <- qtcatHit(pheno, geno)
#'
#' # QTC medoids
#' medo <- medoQtc(fitted, geno)
#' }
#'
#' @importFrom methods is
#' @importFrom stats cor
#' @export
medoQtc <- function(object, geno, alpha = 0.05, min.absCor = 0.05) {
  stopifnot(is(object, "qtcatHit"))
  stopifnot(is(geno, "qtcatGeno"))
  sigClust <- summary(object, alpha, min.absCor)
  if (nrow(sigClust)) {
    clusters <- split(rownames(sigClust), sigClust$clusters)
    medoids <- sapply(clusters, function(names, geno) {
      if (length(names) > 1L)
        return(names[which.max(abs(cor(geno$x[, names])))])
      else
        return(names)
    }, geno = geno)
  } else {
    medoids <- c()
  }
  medoids
}


#' @title Fitting a Linear Model to QTCs
#'
#' @description Linear model between phenotype and medoids of QTCs (significant SNP
#' clusters).
#'
#' @param object an object of class \code{\link{qtcatHit}}.
#' @param pheno an object of class \code{\link{qtcatPheno}}.
#' @param geno an object of class \code{\link{qtcatGeno}}.
#' @param alpha an alpha level for significance estimation.
#' @param min.absCor minimum absolute value of correlation considered.
#'
#' @examples
#' # If you want to run the examples, use:
#' # example(lmQtc, run.dontrun = TRUE)
#' \dontrun{
#' # files containing example data for SNP data and the phenotype
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#' geno <- qtcatGeno(snp, clust)
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#' fitted <- qtcatHit(pheno, geno)
#'
#' # fitting a LM to the phenotype and QTC medoids
#' lmfitted <- lmQtc(fitted, pheno, geno)
#' }
#'
#' @importFrom methods is
#' @importFrom stats lm
#' @export
lmQtc <- function(object, pheno, geno, alpha = 0.05, min.absCor = 0.05) {
  stopifnot(is(object, "qtcatHit"))
  stopifnot(is(pheno, "qtcatPheno"))
  stopifnot(is(geno, "qtcatGeno"))
  id <- intersect(pheno$names, rownames(geno$x))
  phenoInx <- which(pheno$names %in% id)
  if (!length(id))
    stop("The ID intersect of 'pheno' and 'geno' is emty")
  if (length(id.uniqueGeno <- setdiff(rownames(geno$x), id)))
    cat("The following individuals are part of 'geno' but not of 'pheno':\n",
        paste(id.uniqueGeno, collapse = " "), "\n")
  if (length(id.uniquePheno <- setdiff(pheno$names, id)))
    cat("The following individuals are part of 'pheno' but not of 'geno':\n",
        paste(id.uniquePheno, collapse = " "), "\n")
  medoids <- medoQtc(object, geno, alpha, min.absCor)
  if (length(medoids)) {
    xg <- geno$x[, colnames(geno$x) %in% medoids, drop = FALSE]
    genoInx <- match(pheno$names[phenoInx], rownames(xg))
    rownames(xg) <- NULL
    if (ncol(pheno$covariates)) {
      dat <- data.frame(y = pheno$pheno[phenoInx],
                        pheno$covariates[phenoInx, ],
                        xg[genoInx, ])
    } else {
      dat <- data.frame(y = pheno$pheno[phenoInx], xg[genoInx, ])
    }
  } else if (ncol(pheno$covariates)) {
    dat <- data.frame(y = pheno$pheno[phenoInx],
                      pheno$covariates[phenoInx, ])
  } else {
    dat <- data.frame(y = pheno$pheno[phenoInx])
  }
  out <- lm(y ~ ., data = dat)
  out
}


#' @title Plot resulting QTCs of the Hierarchical Inference Test
#'
#' @description Plot the QTCs (significant cluster of SNPs) at their
#' position at the genome.
#'
#' @param x an object of class \code{\link{qtcatHit}}.
#' @param alpha an alpha level for significance estimation.
#' @param xlab a title for the x axis.
#' @param ylab a title for the y axis.
#' @param col.axis colors for axis line, tick marks, and title respectively.
#' @param ... other graphical parameters may also be passed as arguments to this function.
#'
#' @examples
#' # If you want to run the examples, use:
#' # example(plotQtc, run.dontrun = TRUE)
#' \dontrun{
#' # files containing example data for SNP data and the phenotype
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#' geno <- qtcatGeno(snp, clust)
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#' fitted <- qtcatHit(pheno, geno)
#'
#' # Plot the QTCs (loci37, loci260, and loci367 are causal)
#' plotQtc(fitted)
#' }
#'
#' @importFrom graphics plot axis mtext
#' @export
plotQtc <- function(x, alpha = 0.05, xlab = "Chromosomes",
                    ylab = expression(-log[10](italic(p))), col.axis = NULL, ...) {
  stopifnot(is(x, "qtcatHit"))
  # make positions linear with gaps between chr's
  pos <- snpInfo(x)[, 1:2]
  chrminmax <- vapply(split(pos[, 2L], pos[, 1L]), function(x) c(min(x), max(x)), c(1, 2))
  chrgap <- sum(chrminmax[2L, ]) * .01
  chrsize <- cumsum(c(0, chrminmax[2L, -ncol(chrminmax)])) -
    cumsum(chrminmax[1L, ]) +
    cumsum(c(0, rep(chrgap, ncol(chrminmax) - 1L)))
  chr <- sort(unique(pos[, 1L]))
  for (i in seq_along(chr)) {
    inx <- which(pos[, 1L] == chr[i])
    pos[inx, 2L] <- pos[inx, 2L] + chrsize[i]
  }
  chrstartend <- vapply(split(pos[, 2L], pos[, 1L]), function(x) c(min(x), max(x)), c(1, 2))
  xlim <- c(chrstartend[1L, 1L] - chrgap, chrstartend[2L, ncol(chrstartend)] + chrgap)
  # get QTCs from result
  qtc <- qtcatQtc(x, alpha = alpha, min.absCor = .01)
  for (i in seq_along(chr)) {
    inx2 <- which(qtc[, 1L] == chr[i])
    qtc[inx2, 2L] <- qtc[inx2, 2L] + chrsize[i]
  }
  qtc$pValues[qtc$pValues] <- qtc$pValues[qtc$pValues] + 1e-308
  qtc$log.pValues <- -log10(qtc$pValues)
  # plot
  plot(qtc$pos, qtc$log.pValues,  xlim = xlim, ylim = c(0, max(9, qtc$log.pValues) + 1),
       axes = FALSE, xlab = "", ylab = "", ...)
  # x
  for (i in seq_along(chr))
    axis(1, labels = FALSE, at = c(chrstartend[1, i], chrstartend[2, i]), col = col.axis)
  axis(1, at = colMeans(chrstartend), labels = chr, col = NA, col.axis = col.axis)
  mtext(xlab, 1, 2.5, col = col.axis)
  # y
  axis(2, col.axis = col.axis, col = col.axis)
  mtext(expression(-log[10](italic(p))), 2, 2.5, col = col.axis)
}


#' @title Plot markers selection frequencies of the Hierarchical Inference Test
#'
#' @description Plot markers selection frequencies at their
#' position at the genome.
#'
#' @param x an object of class \code{\link{qtcatHit}}.
#' @param xlab a title for the x axis.
#' @param ylab a title for the y axis.
#' @param col.axis colors for axis line, tick marks, and title respectively.
#' @param ... other graphical parameters may also be passed as arguments to this function.
#'
#' @examples
#' # If you want to run the examples, use:
#' # example(plotSelFreq, run.dontrun = TRUE)
#' \dontrun{
#' # files containing example data for SNP data and the phenotype
#' pfile <- system.file("extdata/phenodata.csv", package = "qtcat")
#' gfile <- system.file("extdata/snpdata.csv", package = "qtcat")
#' pdat <- read.csv(pfile, header = TRUE)
#' snp <- read.snpData(gfile, sep = ",")
#' clust <- qtcatClust(snp)
#' geno <- qtcatGeno(snp, clust)
#' pheno <- qtcatPheno(names = pdat[, 1],
#'                     pheno = pdat[, 2],
#'                     covariates = model.matrix(~ pdat[, 3]))
#' fitted <- qtcatHit(pheno, geno)
#'
#' # Plot the selection frequncy of markers (loci37, loci260, and loci367 are causal)
#' plotSelFreq(fitted)
#' }
#'
#' @importFrom graphics plot axis mtext
#' @export
plotSelFreq <- function(x, xlab = "Chromosomes", ylab = "Sel. freq.",
                        col.axis = NULL, ...) {
  stopifnot(is(x, "qtcatHit"))
  # make positions linear with gaps between chr's
  pos <- snpInfo(x)[, 1:2]
  chrminmax <- vapply(split(pos[, 2L], pos[, 1L]), function(x) c(min(x), max(x)), c(1, 2))
  chrgap <- sum(chrminmax[2L, ]) * .01
  chrsize <- cumsum(c(0, chrminmax[2L, -ncol(chrminmax)])) -
    cumsum(chrminmax[1L, ]) +
    cumsum(c(0, rep(chrgap, ncol(chrminmax) - 1L)))
  chr <- sort(unique(pos[, 1L]))
  for (i in seq_along(chr)) {
    inx <- which(pos[, 1L] == chr[i])
    pos[inx, 2L] <- pos[inx, 2L] + chrsize[i]
  }
  chrstartend <- vapply(split(pos[, 2L], pos[, 1L]), function(x) c(min(x), max(x)), c(1, 2))
  xlim <- c(chrstartend[1L, 1L] - chrgap, chrstartend[2L, ncol(chrstartend)] + chrgap)
  # hit lasso selection fequency
  inx <- which(x$selectFreq > 0)
  selfreq <- cbind(pos[names(x$hier), 2L][inx], x$selectFreq[inx])
  # hit lasso selection fequency plot
  plot(selfreq, xlim = xlim, ylim = c(0, 1.1), axes = FALSE, xlab = "", ylab = "", ...)
  # x
  for (i in seq_along(chr))
    axis(1, labels = FALSE, at = c(chrstartend[1, i], chrstartend[2, i]), col = col.axis)
  axis(1, at = colMeans(chrstartend), labels = chr, col = NA, col.axis = col.axis)
  mtext(xlab, 1, 2.5, col = col.axis)
  # y
  axis(2, at = c(0, .5,  1), col = col.axis, col.axis = col.axis)
  mtext(ylab, 2, 2.5, col = col.axis)
}
QTCAT/qtcat documentation built on April 20, 2021, 11:20 p.m.