#' Creates a manhattanr object
#'
#' An object of class manhattanr includes all the needed information for
#' producing a manhattan plot. The goal is to seperate the pre-processing of the
#' manhattan plot elements from the graphical rendaring of the object, which
#' could be done using any graphical device including
#' \code{\link[plotly]{plot_ly}} and \code{\link[graphics]{plot}} in base
#' \code{R}.
#'
#' @param x A \code{data.frame} which must contain at least the following three
#' columns: \itemize{ \item{the chromosome number} \item{genomic base-pair
#' position} \item{a numeric quantity to plot such as a p-value or zscore} }
#' @param chr A string denoting the column name for the chromosome. Default is
#' \code{chr = "CHR"}. This column must be \code{numeric} or \code{integer}.
#' Minimum number of chromosomes required is 1. If you have X, Y, or MT
#' chromosomes, be sure to renumber these 23, 24, 25, etc.
#' @param bp A string denoting the column name for the chromosomal position.
#' Default is \code{bp = "BP"}. This column must be \code{numeric} or
#' \code{integer}.
#' @param p A string denoting the column name for the numeric quantity to be
#' plotted on the y-axis. Default is \code{p = "P"}. This column must be
#' \code{numeric} or \code{integer}. This does not have to be a p-value. It
#' can be any numeric quantity such as peak heights, bayes factors, test
#' statistics. If it is not a p-value, make sure to set \code{logp = FALSE}.
#' @param snp A string denoting the column name for the SNP names (e.g. rs
#' number). More generally, this column could be anything that identifies each
#' point being plotted. For example, in an Epigenomewide association study
#' (EWAS) this could be the probe name or cg number. This column should be a
#' \code{character}. This argument is optional, however it is necessary to
#' specify if you want to highlight points on the plot using the
#' \code{highlight} argument in the \code{\link{manhattanly}} function
#' @param gene A string denoting the column name for the GENE names. This column
#' could be a \code{character} or \code{numeric}. More generally this could be
#' any annotation information that you want to include in the plot. This
#' argument is optional.
#' @param annotation1 A string denoting the column name for an annotation. This
#' column could be a \code{character} or \code{numeric}. This could be any
#' annotation information that you want to include in the plot (e.g. zscore,
#' effect size, minor allele frequency). This argument is optional.
#' @param annotation2 A string denoting the column name for an annotation. This
#' column could be a \code{character} or \code{numeric}. This could be any
#' annotation information that you want to include in the plot (e.g. zscore,
#' effect size, minor allele frequency). This argument is optional.
#' @param logp If TRUE, the -log10 of the p-value is plotted. It isn't very
#' useful to plot raw p-values, but plotting the raw value could be useful for
#' other genome-wide plots, for example, peak heights, bayes factors, test
#' statistics, other "scores" etc.
#'
#' @return A \code{list} object of class \code{manhattanr} with the following
#' elements \describe{ \item{data}{processed data to be used for plotting}
#' \item{xlabel}{The label of the x-axis which is determined by the number of
#' chromosomes present in the data} \item{ticks}{the coordinates on the x-axis
#' of where the tick marks should be placed} \item{labs}{the labels for each
#' tick. This defaults to the chromosome number but can be changed in the
#' \code{\link{manhattanly}} function } \item{nchr}{the number of unique
#' chromosomes present in the data} \item{pName, snpName, geneName,
#' annotation1Name, annotation2Name}{The names of the columns corresponding to
#' the data provided. This information is used for annotating the plot in the
#' \code{\link{manhattanly}} function } }
#' @export
#' @source The pre-processing is mostly the same as the
#' manhattan function from the
#' \href{https://github.com/stephenturner/qqman}{\code{qqman}} package
#'
#' @examples
#' # HapMap dataset included in this package already has columns named P, CHR and BP
#' library(manhattanly)
#' DT <- manhattanr(HapMap)
#' class(DT)
#' head(DT[["data"]])
#'
#' # include snp and gene information
#' DT2 <- manhattanr(HapMap, snp = "SNP", gene = "GENE")
#' head(DT2[["data"]])
#' @seealso \code{\link{manhattanly}}
manhattanr <- function(x,
chr = "CHR",
bp = "BP",
p = "P",
snp,
gene,
annotation1,
annotation2,
logp = TRUE) {
# NULLing out strategy
CHR <- BP <- P <- index <- NULL
# Check for sensible dataset
# Make sure you have chr, bp and p columns.
if (!(chr %in% names(x))) stop(paste("Column", chr, "not found in 'x' data.frame"))
if (!(bp %in% names(x))) stop(paste("Column", bp, "not found in 'x' data.frame"))
if (!(p %in% names(x))) stop(paste("Column", p, "not found 'x' data.frame"))
# warn if you don't have a snp column
if (!missing(snp)) {
if (!(snp %in% names(x))) stop(sprintf("snp argument specified as %s but this column not found in 'x' data.frame", snp))
}
if (!missing(gene)) {
if (!(gene %in% names(x))) stop(sprintf("gene argument specified as %s but this column not found in 'x' data.frame", gene))
}
if (!missing(annotation1)) {
if (!(annotation1 %in% names(x))) stop(sprintf("annotation1 argument specified as %s but this column not found in 'x' data.frame", annotation1))
}
if (!missing(annotation2)) {
if (!(annotation2 %in% names(x))) stop(sprintf("annotation2 argument specified as %s but this column not found in 'x' data.frame", annotation2))
}
# make sure chr, bp, and p columns are numeric.
if (!is.numeric(x[[chr]])) stop(paste(chr, "column should be numeric. Do you have 'X', 'Y', 'MT', etc? If so change to numbers and try again."))
if (!is.numeric(x[[bp]])) stop(paste(bp, "column should be numeric."))
if (!is.numeric(x[[p]])) stop(paste(p, "column should be numeric."))
# Create a new data.frame with columns called CHR, BP, and P.
d <- data.frame(CHR = x[[chr]], BP = x[[bp]], P = x[[p]])
# If the input data frame has a SNP column, add it to the new data frame
# you're creating. Rename columns according to input
if (!missing(snp)) {
# using transform converts the snps to a factor variable!
d[["SNP"]] <- x[[snp]]
colnames(d)[which(colnames(d) == "SNP")] <- snp
}
if (!missing(gene)) {
d[["GENE"]] <- x[[gene]]
colnames(d)[which(colnames(d) == "GENE")] <- gene
}
if (!missing(annotation1)) {
d[["ANNOTATION1"]] <- x[[annotation1]]
colnames(d)[which(colnames(d) == "ANNOTATION1")] <- annotation1
}
if (!missing(annotation2)) {
d[["ANNOTATION2"]] <- x[[annotation2]]
colnames(d)[which(colnames(d) == "ANNOTATION2")] <- annotation2
}
# Set positions, ticks, and labels for plotting
# Sort and keep only values where is numeric.
d <- subset(d, (is.numeric(CHR) & is.numeric(BP) & is.numeric(P)))
d <- d[order(d$CHR, d$BP), ]
if (logp) {
d$logp <- -log10(d$P)
} else {
d$logp <- d$P
}
d$pos <- NA
# Fixes the bug where one chromosome is missing by adding a sequential index column.
d$index <- NA
ind <- 0
for (i in unique(d$CHR)) {
ind <- ind + 1
d[d$CHR == i, ]$index <- ind
}
# This section sets up positions and ticks. Ticks should be placed in the
# middle of a chromosome. The a new pos column is added that keeps a running
# sum of the positions of each successive chromsome. For example:
# chr bp pos
# 1 1 1
# 1 2 2
# 2 1 3
# 2 2 4
# 3 1 5
nchr <- length(unique(d$CHR))
if (nchr == 1) {
# For a single chromosome
d$pos <- d$BP
ticks <- floor(length(d$pos)) / 2 + 1
xlabel <- paste("Chromosome", unique(d$CHR), "position")
labs <- ticks
} else {
# For multiple chromosomes
lastbase <- 0
ticks <- NULL
for (i in unique(d$index)) {
if (i == 1) {
d[d$index == i, ]$pos <- d[d$index == i, ]$BP
} else {
lastbase <- lastbase + utils::tail(base::subset(d, index == i - 1)$BP, 1)
d[d$index == i, ]$pos <- d[d$index == i, ]$BP + lastbase
}
# Old way: assumes SNPs evenly distributed
# ticks=c(ticks, d[d$index==i, ]$pos[floor(length(d[d$index==i, ]$pos)/2)+1])
# New way: doesn't make that assumption
ticks <- c(ticks, (min(d[d$index == i, ]$pos) + max(d[d$index == i, ]$pos)) / 2 + 1)
}
xlabel <- "Chromosome"
labs <- unique(d$CHR)
}
manhattanr <- list(
data = d, xlabel = xlabel, ticks = ticks, labs = labs,
nchr = nchr, pName = p,
snpName = if (missing(snp)) NA else snp,
geneName = if (missing(gene)) NA else gene,
annotation1Name = if (missing(annotation1)) NA else annotation1,
annotation2Name = if (missing(annotation2)) NA else annotation2
)
class(manhattanr) <- "manhattanr"
manhattanr
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.