R/VEGAStools.R

#' @title Read a VEGAS gene list
#'
#' @description Reads a text file containing output generated by the VEGAS
#' software
#'
#' @param fn Name of the file tor read
#' @param path Path to file, defaults to \code{.}
#' @param adjPermP FLag indicating whether to adjust the permutation p-values
#'        reported by VEGAS, see below. Defaults to \code{TRUE}.
#' @param p.adjust The multiple testing adjustment for the gene-wise 
#' p-values; can be any the methods in \code{\link{p.adjust.methods}} 
#' except for \code{none}.
#'
#' @details VEGAS returns permutation p-values that are exactly zero, which is
#' of course impossible. The read-in function adjusts this by including the
#' observed test statistic in the null-count. 
#'
#' @return A data frame containing the data with extra class attribute 
#' \code{VEGAS}. This is a straightforward wrapper for the raw data, 
#' except for the extra column with adjusted p-values, with is named 
#' \code{adjPvalue.<adjustment method>}.
#'
#' #' @seealso \code{\link[annotate]{htmlpage}} \code{\link{topTable}}
#' 
#' @export
readVEGAS = function(fn, path=".", adjPermP=TRUE, p.adjust="holm")
{
	fn = file.path(path, fn)
	ret = read.table(fn, header=TRUE)
	if (adjPermP) {
		ret$Pvalue = adjustPermP(ret$Pvalue, ret$nSim)
	}
	p.adjust = match.arg(p.adjust, setdiff(p.adjust.methods, "none"))
	adjp     = p.adjust(ret$Pvalue, p.adjust)
	ret = cbind(ret, adjp)
	colnames(ret)[ncol(ret)] = paste("adjPvalue", p.adjust, sep=".")
	class(ret) = c("VEGAS", "data.frame")
	ret
}

adjustPermP = function(p, N)
{
	(p*N+1)/(N+1)
}

updateAdjP = function(x) 
{
	nc = ncol(x)
	method = strsplit(colnames(x)[nc], "\\.")[[1]][2]
	x[, nc] = p.adjust(x$Pvalue, method)
	x
}
	

#' Summarize VEGAS data
#'
#' Display a short summary of a VEGAS dataset
#'
#' @param x A VEGAS object
#' @param ... Other arguments, currently ignored
#'
#' @return The same object, invisibly
#' @export
summary.VEGAS = function(x, ...)
{
	n = nrow(x)
	dup = length(which(duplicated(x$Gene)))
	p = summary(x$Pvalue)

	cat(n, " genes total (", dup, " duplicate IDs)\n\n", sep="")
	cat("Genewise p-values:\n")
	print(p)

	invisible(x)
}	


#' Handle duplicates in VEGAS gene lists
#'
#' Return entries with duplicate gene names, or drop them from a VEGAS gene list
#'
#' @param x A gene list of class \code{VEGAS}
#'
#' @return A shorter VEGAS gene lists, either consisting of only duplciate entries
#' or the original list without the duplicated entries
#'
#' @export
showDuplicates = function(x) x[x$Gene %in% x$Gene[duplicated(x$Gene)], ]

#' @rdname showDuplicates
#'
#' @details When dropping duplicates, it is the second entry from the top that
#' is excluded. Strategic pre-sorting of the list can be useful if you want to
#' be more specific. Note that the adjusted p-values will be updated to
#' reflect the smaller number of genes
#'
#' @export
dropDuplicates = function(x) updateAdjP(x[!duplicated(x$Gene), ])


#' Intersect VEGAS results
#'
#' Takes any number of VEGAS data objects and returns a list with VEGAS data sets
#' reduced to the shared set of genes
#'
#' @param ... A list of VEGAS objects, separated by commas
#'
#' @details The p-value adjustment for multiple testing is updated.
#'
#' @return A list of VEGAS objects
#' @export
intersectVEGAS = function(...)
{
    args = list(...)
    namu = as.character(substitute(list(...)))[-1]
    
	## Some checks
	nargs = length(args)
    if (nargs < 2) stop("Need at least two lists to intersect")
    isVEGAS = sapply(args, function(x) "VEGAS" %in% class(x) )
    if (!all(isVEGAS)) stop("This function intersections only VEGAS objects")
    
    ## Build the intersect
    gl = lapply(args, function(x) as.character(x$Gene))
    is = gl[[1]]
    for (i in 2:nargs) {
		is = intersect(is, gl[[i]])
	}

	## Cut down the gene lists, sort and return with the names of the original
	## objects
	ret = lapply(args, function(x) subset(x, as.character(Gene) %in% is) )
	ret = lapply(ret, function(x) x[order(x$Gene), ] )
	ret = lapply(ret, updateAdjP)
	names(ret) = namu
	ret
}

#' Display expected and observed counts of co-significant genes
#'
#' Given two gene lists of class \code{VEGAS} and a p-value threshold, this
#' function reports the number of genes below this threshold on both lists,
#' with a 95\% confidence interval for the true value, as well as the 
#' the expected number under the null hypothesis of no association
#' between the gene lists, and the corresponding p-value
#'
#' @param x,y Two gene lists of class \code{VEGAS}
#' @param co Cutoff for selecting statistically significant genes in both lists
#' @param adjusted Logical flag indicating whether to use the raw or adjusted p-values (default: \code{FALSE}).
#'
#' @return \code{counts} returns a named vector with five components: \code{Obs},
#' the number of genes observed to be statistically significant on both lists at the chosen
#' cutoff, \code{LCL} and \code{UCL}, the corresponding confidence interval
#' for the number of overlapping genes, \code{Exp}, the number of genes
#' expected to be significant on both lists under the assumption of no
#' association between the underlying traits, and \code{p.value}, the 
#' corresponding two-sided p-value. 
#'
#' @details Confidence intervals, expected count and p-value are based 
#' on \code{binom.test}. The expected count under the null hypothesis 
#' is conditional on the p-values observed for each trait individually, 
#' i.e. we assume that the probability to be significant on both lists is 
#' the product of the marginal proportions of significant genes on each 
#' list. 
#' \strong{Warning:} inference assumes independence between genes, which is a rather
#' strong assumption in this setting.
#' 
#' @export
counts = function(x, y, co=0.05, adjusted=FALSE)
{
	nn = nrow(x)
	if (!adjusted) {
		pval_namx = pval_namy = "Pvalue"
	} else {
		cnx = colnames(x)
		pval_namx = cnx[pmatch("adjPvalue", cnx)]
		cny = colnames(y)
		pval_namy = cny[pmatch("adjPvalue", cny)]
	}
	if (pval_namx != pval_namy) warning("You are comparing two differently adjusted p-values. This is probably a bad idea.")

	x0 = x[, pval_namx] <= co
	y0 = y[, pval_namy] <= co		
	obs = length(which(x0 & y0))
	p0  = length(which(x0))*length(which(y0))/(nn*nn)
	exp = p0*nn
	bin = binom.test( c(obs, nn-obs), p=p0)
	LCL = bin$conf.int[1]*nn
	UCL = bin$conf.int[2]*nn
	pval  = bin$p.value
	c(round(c(Obs=obs, LCL=LCL, UCL=UCL, Exp=exp)), p.value=pval)
}

#' @rdname counts
#'
#' @param minP,maxP Smallest and largest cutoff value for considering the
#'                  overlap in significant genes between lists
#' @param nP Number of intermediate points between \code{minP} and \code{maxP}
#' @param legend Logical flag indicating whether to add a legend to the plot
#' @param ylim,title,xlab,ylab,... Standard graphical prarameters for \code{plot}
#'                   to override the function defaults
#'
#' @return \code{plotCounts} generates a plot of observed/expected counts as a
#' function of the specified cutoff range and returns invisibly a data frame
#' with the cutoffs (column \code{co}) and the corresponding output from
#' \code{counts}.
#'
#' @export
plotCounts = function(x, y, minP=1E-6, maxP=0.1, nP=100, adjusted=FALSE, legend=TRUE, ylim, title, xlab, ylab, ...)
{
	xx = seq(minP, maxP, length=nP)
	cnts = t(sapply(xx, function(p) counts(x, y, co=p, adjusted=adjusted)))
	if (missing(ylim)) {
		ylim = c(min(cnts), max(cnts))
	}
	if (missing(title)) {
		title = paste(deparse(substitute(x)), " vs. ", deparse(substitute(y)), sep="")
	}
	if (missing(xlab)) {
		xlab = "p-value threshold"
	}
	if (missing(ylab)) {
		ylab = "Counts double significant"
	}
	
	plot(xx, cnts[,4], type="l", ylim=ylim, xlab=xlab, ylab=ylab, main=title, lwd=2, ...)
	lines(xx, cnts[,1], lwd=2, col="red")
	lines(xx, cnts[,2], lty=2, col="red")
	lines(xx, cnts[,3], lty=2, col="red")
	if (legend) {
		legend("topleft", c("Observed", "95% CI", "Expected under H0"), col=c("red", "red", "black"), lty=c(1,2,1), lwd=c(2,1,2))
	}
	
	ret = data.frame(cnts, co=xx)
	invisible(ret)
}	

#' Display the most significant genes
#'
#' Displays a specfied number of top genes below a specified p-value threshold
#'
#' @param x A gene list object of class \code{VEGAS}
#' @param nmax The maximum number of genes to display
#' @param co Cutoff for the genewise p-values: only genes with an 
#'  adjusted p-value less or equal this cutoff are considered for display
#'
#' @return An object of class \code{VEGAS}
#' @export
topTable = function(x, nmax=30, co=1)
{
	adjp = x[, ncol(x)]
	ndx  = adjp <= co
	ret = subset(x, ndx)
	ret = ret[order(ret$Pvalue, -ret$nSNPs, as.character(ret$Gene)), ]
	head(ret, nmax)
}	

#' Annotate a VEGAS genelist
#'
#' Adds gene name and Entrez identifier for genes in a VEGAS gene list
#'
#' @param x A gene list of class \code{VEGAS}
#' @param anndb An annotation database for an organism of class \code{\link{OrgDb-class}},
#' available from Bioconductor. The default is \code{\link{org.Hs.eg.db}}.
#'
#' @return If a valid annotation database is specified, an object of class
#' \code{annVEGAS} which inherits from class \code{VEGAS} and has two extra
#' columns (\code{Description} and \code{Entrez}). If no valid annotation
#' database is specified or the default database is not available, the original
#' \code{VEGAS} object and a warning are returned. 
#'
#' @details This only works if an annotation database for the organism under study
#' is locally available. Also, current matching of gene names does not account
#' for aliases and does not work terribly well (FIXME).
#'
#' @export
annotateVEGAS = function(x, anndb)
{
	if ("annVEGAS" %in% class(x)) return(x)
	if (missing(anndb)) {
		if(require("org.Hs.eg.db")) {
			anndb = org.Hs.eg.db
		} else {
			warning("Cannot find database org.Hs.eg.db - no annotation")
			return(x)
		}
	} else {
		if (!inherits(anndb, "orgDb")) {
			warning(deparse(substitute(anndb)), " is not of class orgDb - no annotation")
			return(x)
		}
			
	}
    
    gg = as.character(x$Gene)
    namu = mapIds(anndb, keys=gg, keytype="SYMBOL", column="GENENAME")
    entr = mapIds(anndb, keys=gg, keytype="SYMBOL", column="ENTREZID")
    ret = cbind(x, Description=namu, Entrez=entr)
    class(ret) = c("annVEGAS", class(x))
    ret
}

#' Write an annotated VEGAS genelist to an HTML file
#'
#' This function combines the functionality of \code{\link{topTable}} and
#' \code{\link{annotateVEGAS}} to generate annotated lists of top significant
#' genes with links to the NCBI Entrez database and writes the resulting lists
#' to a local HTML file.
#'
#' @param x A gene list of class \code{VEGAS}
#' @param filename The name of the output file
#' @param title Title of the HTML page
#' @param nmax,co Maximum number of genes in output list and p-value cutoff, see
#' \code{\link{topTable}}
#'
#' @return This function is called for the side effect of generating an HTML
#' output file
#'
#' @seealso \code{\link[annotate]{htmlpage}}
#'
#' @export
exportVEGAS = function(x, filename, title, nmax=Inf, co=1)
{
	if (missing(filename)) filename = paste(deparse(substitute(x)), ".html", sep="")
	if (missing(title))    title = deparse(substitute(x))
	title = paste("<H3>", title, "</H3>")

	x = annotateVEGAS(topTable(x, nmax=nmax, co=co))
	gl = list(as.character(x$Entrez))
	other = subset(x, select=-Entrez)
	table.head = c("Entrez", "Chr.", "Gene", "nSNPs", "nSims",
	               "Start", "Stop", "Test stat.", "p-value",
	               "Best SNP", "SNP.pval", "adj. p-value", "Description")

	other$Pvalue = format.pval(other$Pvalue, digits=1)
	other$SNP.pvalue = format.pval(other$SNP.pvalue, digits=1)
	adjp_cn = ncol(other) - 1	## variable name, alas
	other[, adjp_cn] = format.pval(other[, adjp_cn], digits=1)	
	other$nSims  = format(round(other$nSims, 1))

	annotate::htmlpage(gl, filename, title, other, table.head, )
}	

#' Extract p-values from a list of VEGAS objects
#'
#' This function takes the output from \code{} and
#' returns a matrix of p-values
#'
#' @param x A list returned by \code{intersectVEGAS}
#' @param co Upper cutoff for p-values to be included
#' @param min_cnt Minimum number of VEGAS data sets that have to have 
#' a p-value smaller or equal to \code{co} for a gene to be included
#' @param adjusted Logical flag indicating whether to return adjusted
#' p-values or unadjusted ones (default).
#' 
#' @return A matrix of p-values, possibly empty
#'
#' @seealso \code{\link{intersectVEGAS}}
#'
#' @export
getSharedP = function(x, co=1, min_cnt=2, adjusted=FALSE)
{
	nc = if (adjusted) pmatch("adjPvalue", colnames(x[[1]])) else pmatch("Pvalue", colnames(x[[1]]))
	sharedP = sapply(x, function(x) x[, nc])	
	rownames(sharedP) = as.character(x[[1]]$Gene)

	cnt = apply(sharedP, 1, function(x) length(which(x <= co)))
	ndx = cnt >= min_cnt
	sharedP = sharedP[ndx,]
		
	sharedP
}


    
alexploner/VEGAStools documentation built on May 10, 2019, 8:57 a.m.