#' @title Read a VEGAS gene list
#'
#' @description Reads a text file containing output generated by the VEGAS
#' software
#'
#' @param fn Name of the file tor read
#' @param path Path to file, defaults to \code{.}
#' @param adjPermP FLag indicating whether to adjust the permutation p-values
#' reported by VEGAS, see below. Defaults to \code{TRUE}.
#' @param p.adjust The multiple testing adjustment for the gene-wise
#' p-values; can be any the methods in \code{\link{p.adjust.methods}}
#' except for \code{none}.
#'
#' @details VEGAS returns permutation p-values that are exactly zero, which is
#' of course impossible. The read-in function adjusts this by including the
#' observed test statistic in the null-count.
#'
#' @return A data frame containing the data with extra class attribute
#' \code{VEGAS}. This is a straightforward wrapper for the raw data,
#' except for the extra column with adjusted p-values, with is named
#' \code{adjPvalue.<adjustment method>}.
#'
#' #' @seealso \code{\link[annotate]{htmlpage}} \code{\link{topTable}}
#'
#' @export
readVEGAS = function(fn, path=".", adjPermP=TRUE, p.adjust="holm")
{
fn = file.path(path, fn)
ret = read.table(fn, header=TRUE)
if (adjPermP) {
ret$Pvalue = adjustPermP(ret$Pvalue, ret$nSim)
}
p.adjust = match.arg(p.adjust, setdiff(p.adjust.methods, "none"))
adjp = p.adjust(ret$Pvalue, p.adjust)
ret = cbind(ret, adjp)
colnames(ret)[ncol(ret)] = paste("adjPvalue", p.adjust, sep=".")
class(ret) = c("VEGAS", "data.frame")
ret
}
adjustPermP = function(p, N)
{
(p*N+1)/(N+1)
}
updateAdjP = function(x)
{
nc = ncol(x)
method = strsplit(colnames(x)[nc], "\\.")[[1]][2]
x[, nc] = p.adjust(x$Pvalue, method)
x
}
#' Summarize VEGAS data
#'
#' Display a short summary of a VEGAS dataset
#'
#' @param x A VEGAS object
#' @param ... Other arguments, currently ignored
#'
#' @return The same object, invisibly
#' @export
summary.VEGAS = function(x, ...)
{
n = nrow(x)
dup = length(which(duplicated(x$Gene)))
p = summary(x$Pvalue)
cat(n, " genes total (", dup, " duplicate IDs)\n\n", sep="")
cat("Genewise p-values:\n")
print(p)
invisible(x)
}
#' Handle duplicates in VEGAS gene lists
#'
#' Return entries with duplicate gene names, or drop them from a VEGAS gene list
#'
#' @param x A gene list of class \code{VEGAS}
#'
#' @return A shorter VEGAS gene lists, either consisting of only duplciate entries
#' or the original list without the duplicated entries
#'
#' @export
showDuplicates = function(x) x[x$Gene %in% x$Gene[duplicated(x$Gene)], ]
#' @rdname showDuplicates
#'
#' @details When dropping duplicates, it is the second entry from the top that
#' is excluded. Strategic pre-sorting of the list can be useful if you want to
#' be more specific. Note that the adjusted p-values will be updated to
#' reflect the smaller number of genes
#'
#' @export
dropDuplicates = function(x) updateAdjP(x[!duplicated(x$Gene), ])
#' Intersect VEGAS results
#'
#' Takes any number of VEGAS data objects and returns a list with VEGAS data sets
#' reduced to the shared set of genes
#'
#' @param ... A list of VEGAS objects, separated by commas
#'
#' @details The p-value adjustment for multiple testing is updated.
#'
#' @return A list of VEGAS objects
#' @export
intersectVEGAS = function(...)
{
args = list(...)
namu = as.character(substitute(list(...)))[-1]
## Some checks
nargs = length(args)
if (nargs < 2) stop("Need at least two lists to intersect")
isVEGAS = sapply(args, function(x) "VEGAS" %in% class(x) )
if (!all(isVEGAS)) stop("This function intersections only VEGAS objects")
## Build the intersect
gl = lapply(args, function(x) as.character(x$Gene))
is = gl[[1]]
for (i in 2:nargs) {
is = intersect(is, gl[[i]])
}
## Cut down the gene lists, sort and return with the names of the original
## objects
ret = lapply(args, function(x) subset(x, as.character(Gene) %in% is) )
ret = lapply(ret, function(x) x[order(x$Gene), ] )
ret = lapply(ret, updateAdjP)
names(ret) = namu
ret
}
#' Display expected and observed counts of co-significant genes
#'
#' Given two gene lists of class \code{VEGAS} and a p-value threshold, this
#' function reports the number of genes below this threshold on both lists,
#' with a 95\% confidence interval for the true value, as well as the
#' the expected number under the null hypothesis of no association
#' between the gene lists, and the corresponding p-value
#'
#' @param x,y Two gene lists of class \code{VEGAS}
#' @param co Cutoff for selecting statistically significant genes in both lists
#' @param adjusted Logical flag indicating whether to use the raw or adjusted p-values (default: \code{FALSE}).
#'
#' @return \code{counts} returns a named vector with five components: \code{Obs},
#' the number of genes observed to be statistically significant on both lists at the chosen
#' cutoff, \code{LCL} and \code{UCL}, the corresponding confidence interval
#' for the number of overlapping genes, \code{Exp}, the number of genes
#' expected to be significant on both lists under the assumption of no
#' association between the underlying traits, and \code{p.value}, the
#' corresponding two-sided p-value.
#'
#' @details Confidence intervals, expected count and p-value are based
#' on \code{binom.test}. The expected count under the null hypothesis
#' is conditional on the p-values observed for each trait individually,
#' i.e. we assume that the probability to be significant on both lists is
#' the product of the marginal proportions of significant genes on each
#' list.
#' \strong{Warning:} inference assumes independence between genes, which is a rather
#' strong assumption in this setting.
#'
#' @export
counts = function(x, y, co=0.05, adjusted=FALSE)
{
nn = nrow(x)
if (!adjusted) {
pval_namx = pval_namy = "Pvalue"
} else {
cnx = colnames(x)
pval_namx = cnx[pmatch("adjPvalue", cnx)]
cny = colnames(y)
pval_namy = cny[pmatch("adjPvalue", cny)]
}
if (pval_namx != pval_namy) warning("You are comparing two differently adjusted p-values. This is probably a bad idea.")
x0 = x[, pval_namx] <= co
y0 = y[, pval_namy] <= co
obs = length(which(x0 & y0))
p0 = length(which(x0))*length(which(y0))/(nn*nn)
exp = p0*nn
bin = binom.test( c(obs, nn-obs), p=p0)
LCL = bin$conf.int[1]*nn
UCL = bin$conf.int[2]*nn
pval = bin$p.value
c(round(c(Obs=obs, LCL=LCL, UCL=UCL, Exp=exp)), p.value=pval)
}
#' @rdname counts
#'
#' @param minP,maxP Smallest and largest cutoff value for considering the
#' overlap in significant genes between lists
#' @param nP Number of intermediate points between \code{minP} and \code{maxP}
#' @param legend Logical flag indicating whether to add a legend to the plot
#' @param ylim,title,xlab,ylab,... Standard graphical prarameters for \code{plot}
#' to override the function defaults
#'
#' @return \code{plotCounts} generates a plot of observed/expected counts as a
#' function of the specified cutoff range and returns invisibly a data frame
#' with the cutoffs (column \code{co}) and the corresponding output from
#' \code{counts}.
#'
#' @export
plotCounts = function(x, y, minP=1E-6, maxP=0.1, nP=100, adjusted=FALSE, legend=TRUE, ylim, title, xlab, ylab, ...)
{
xx = seq(minP, maxP, length=nP)
cnts = t(sapply(xx, function(p) counts(x, y, co=p, adjusted=adjusted)))
if (missing(ylim)) {
ylim = c(min(cnts), max(cnts))
}
if (missing(title)) {
title = paste(deparse(substitute(x)), " vs. ", deparse(substitute(y)), sep="")
}
if (missing(xlab)) {
xlab = "p-value threshold"
}
if (missing(ylab)) {
ylab = "Counts double significant"
}
plot(xx, cnts[,4], type="l", ylim=ylim, xlab=xlab, ylab=ylab, main=title, lwd=2, ...)
lines(xx, cnts[,1], lwd=2, col="red")
lines(xx, cnts[,2], lty=2, col="red")
lines(xx, cnts[,3], lty=2, col="red")
if (legend) {
legend("topleft", c("Observed", "95% CI", "Expected under H0"), col=c("red", "red", "black"), lty=c(1,2,1), lwd=c(2,1,2))
}
ret = data.frame(cnts, co=xx)
invisible(ret)
}
#' Display the most significant genes
#'
#' Displays a specfied number of top genes below a specified p-value threshold
#'
#' @param x A gene list object of class \code{VEGAS}
#' @param nmax The maximum number of genes to display
#' @param co Cutoff for the genewise p-values: only genes with an
#' adjusted p-value less or equal this cutoff are considered for display
#'
#' @return An object of class \code{VEGAS}
#' @export
topTable = function(x, nmax=30, co=1)
{
adjp = x[, ncol(x)]
ndx = adjp <= co
ret = subset(x, ndx)
ret = ret[order(ret$Pvalue, -ret$nSNPs, as.character(ret$Gene)), ]
head(ret, nmax)
}
#' Annotate a VEGAS genelist
#'
#' Adds gene name and Entrez identifier for genes in a VEGAS gene list
#'
#' @param x A gene list of class \code{VEGAS}
#' @param anndb An annotation database for an organism of class \code{\link{OrgDb-class}},
#' available from Bioconductor. The default is \code{\link{org.Hs.eg.db}}.
#'
#' @return If a valid annotation database is specified, an object of class
#' \code{annVEGAS} which inherits from class \code{VEGAS} and has two extra
#' columns (\code{Description} and \code{Entrez}). If no valid annotation
#' database is specified or the default database is not available, the original
#' \code{VEGAS} object and a warning are returned.
#'
#' @details This only works if an annotation database for the organism under study
#' is locally available. Also, current matching of gene names does not account
#' for aliases and does not work terribly well (FIXME).
#'
#' @export
annotateVEGAS = function(x, anndb)
{
if ("annVEGAS" %in% class(x)) return(x)
if (missing(anndb)) {
if(require("org.Hs.eg.db")) {
anndb = org.Hs.eg.db
} else {
warning("Cannot find database org.Hs.eg.db - no annotation")
return(x)
}
} else {
if (!inherits(anndb, "orgDb")) {
warning(deparse(substitute(anndb)), " is not of class orgDb - no annotation")
return(x)
}
}
gg = as.character(x$Gene)
namu = mapIds(anndb, keys=gg, keytype="SYMBOL", column="GENENAME")
entr = mapIds(anndb, keys=gg, keytype="SYMBOL", column="ENTREZID")
ret = cbind(x, Description=namu, Entrez=entr)
class(ret) = c("annVEGAS", class(x))
ret
}
#' Write an annotated VEGAS genelist to an HTML file
#'
#' This function combines the functionality of \code{\link{topTable}} and
#' \code{\link{annotateVEGAS}} to generate annotated lists of top significant
#' genes with links to the NCBI Entrez database and writes the resulting lists
#' to a local HTML file.
#'
#' @param x A gene list of class \code{VEGAS}
#' @param filename The name of the output file
#' @param title Title of the HTML page
#' @param nmax,co Maximum number of genes in output list and p-value cutoff, see
#' \code{\link{topTable}}
#'
#' @return This function is called for the side effect of generating an HTML
#' output file
#'
#' @seealso \code{\link[annotate]{htmlpage}}
#'
#' @export
exportVEGAS = function(x, filename, title, nmax=Inf, co=1)
{
if (missing(filename)) filename = paste(deparse(substitute(x)), ".html", sep="")
if (missing(title)) title = deparse(substitute(x))
title = paste("<H3>", title, "</H3>")
x = annotateVEGAS(topTable(x, nmax=nmax, co=co))
gl = list(as.character(x$Entrez))
other = subset(x, select=-Entrez)
table.head = c("Entrez", "Chr.", "Gene", "nSNPs", "nSims",
"Start", "Stop", "Test stat.", "p-value",
"Best SNP", "SNP.pval", "adj. p-value", "Description")
other$Pvalue = format.pval(other$Pvalue, digits=1)
other$SNP.pvalue = format.pval(other$SNP.pvalue, digits=1)
adjp_cn = ncol(other) - 1 ## variable name, alas
other[, adjp_cn] = format.pval(other[, adjp_cn], digits=1)
other$nSims = format(round(other$nSims, 1))
annotate::htmlpage(gl, filename, title, other, table.head, )
}
#' Extract p-values from a list of VEGAS objects
#'
#' This function takes the output from \code{} and
#' returns a matrix of p-values
#'
#' @param x A list returned by \code{intersectVEGAS}
#' @param co Upper cutoff for p-values to be included
#' @param min_cnt Minimum number of VEGAS data sets that have to have
#' a p-value smaller or equal to \code{co} for a gene to be included
#' @param adjusted Logical flag indicating whether to return adjusted
#' p-values or unadjusted ones (default).
#'
#' @return A matrix of p-values, possibly empty
#'
#' @seealso \code{\link{intersectVEGAS}}
#'
#' @export
getSharedP = function(x, co=1, min_cnt=2, adjusted=FALSE)
{
nc = if (adjusted) pmatch("adjPvalue", colnames(x[[1]])) else pmatch("Pvalue", colnames(x[[1]]))
sharedP = sapply(x, function(x) x[, nc])
rownames(sharedP) = as.character(x[[1]]$Gene)
cnt = apply(sharedP, 1, function(x) length(which(x <= co)))
ndx = cnt >= min_cnt
sharedP = sharedP[ndx,]
sharedP
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.