#' Tree visualization for Gene Ontology enrichment results
#'
#' \code{GOTreeVis} creates a GOTree visualisation of GO enrichment data and writes it to a pdf file.
#'
#' @param disp_data Enrichment data to visualize.
#' Four columns: \enumerate{
#' \item Term
#' \item Pvalue - scales the length of the branches
#' \item Count (i.e. number of genes matching the term) - scales the thickness of the branches.
#' If this scaling is not needed, this column should be set to all equal values. All branches will then be set to min_size.
#' \item a column used to determine the side of the tree, e.g. ontology (BP/MF).
#' }
#' In case of a one-sided tree (see oneside parameter) the fourth column can be omitted.
#' The side column should be a factor with two levels (first level = left, second = right).
#' For character side columns a factor will be generated (levels will be in alphabetical order).
#' @param out_file Filename of the output file. The name must end with a valid file extension to define the graphics device. PDF output is recommended.
#' @param b_rootlabel Text for the label at the root of the tree. [default: ""]
#' @param b_sidelabels Text for the labels on each side; The first value corresponds to the left side. [default: ('GO:MF','GO:BP')]
#' @param b_parse Parse text for root and side labels using plotmath. Can be used to plot labels containing superscript or subscript.
#' @param root_color Color for the 'root' box at the bottom of the tree. Applies to text and border. [default: black]
#' @param colors Colors for the two sides of the tree, again first=left. [default: blue/green]
#' @param plot_pvalue_labels Should the p-value labels on the axis be plotted or not? [default: T]
#' @param font Font family to be used for the text in the plot [default: sans]
#' @param oneside Special functionality in case only one category should be plotted.
# possible values: 'l' or 'r', meaning all data will be plotted left or right.
#' @param pval_scaling Scaling factor between -log(pvalue) and branch length; Gets generated automatically from the dimensions of the data if left unset (recommended for first run). Can be used to scale the tree in x dimension. [default: NA]
#' @param br_spacing Spacing between branches. Gets generated automatically from the dimensions of the data if left unset (recommended for first run). Can be used to scale the tree in y dimension. [default: NA]
#' @param tr_width Adjusts the width of the trunk. [default: 4]
#' @param br_min_size Sets the count->branch_width scale. Defines the size(width) of the branch with the smallest count value. [default: 2]
#' @param br_max_size Sets the count->branch_width scale. Defines the size(width) of the branch with the highest count value. [default: 4]
#' @param br_angle Angle of the branches in radians as multiples of pi. [default: 1/7, equal to 180°/7]
#' @param br_textoffset_x Manually offset the text from the branches (x-axis). [default: 0]
#' @param br_textoffset_y Manually offset the text from the branches (y-axis). [default: 0]
#' @param br_offset Adjust the position of the top branch (e.g. if the position should be at the very top). [default: scales with spacing]
#' @param plot_size Plot size, passed to ggsave for both height and width; changes of this value will likely require adjustments of label_size. [default: 8]
#' @param label_size Adjusts the font size of all labels in the plot. [default: 4]
#' @param expand_x [default: (0.1,0.1)] can be used to expand the plot area on each side in case of long labels
#'
#' @return PDF file (out_file)
#' @import ggplot2
#' @export
GOTreeVis <- function(disp_data, out_file,
b_rootlabel = "", b_sidelabels = c("GO:MF", "GO:BP"), b_parse = F,
plot_pvalue_labels = T,root_color = "black", colors = c("deepskyblue3", "seagreen3"),
oneside = NULL, font = "sans",
#pval_scaling = 1.9, br_spacing = 5,
pval_scaling = NA, br_spacing = NA,
tr_width = 4, br_min_size = 2, br_max_size = 4,
br_angle = 1/7, br_textoffset_x = 0, br_textoffset_y = 0, br_offset = NA,
plot_size = 8, label_size = 4, expand_x = c(0.1, 0.1)) {
# internal parameters
tr_x <- 0
tr_yst <- 0
text_col <- 1
pval_col <- 2
count_col <- 3
side_col <- 4
# set up side column, especially in case of a onesided plot. in case of onesided plot expand_x is changed (plot is less wide, so
# higher numbers are needed to expand)
if (!is.null(oneside)) {
disp_data[[side_col]] <- factor(rep(oneside, nrow(disp_data)), levels = c("l", "r"))
if(oneside == "r") expand_x <- c(0.2, 0.6)
else expand_x <- c(0.6, 0.2)
}
# check correctness of data.frame to display.
if (ncol(disp_data) != 4 |
! is.character(disp_data[[text_col]]) | ! is.numeric(disp_data[[pval_col]]) |
! is.numeric(disp_data[[count_col]])) {
stop("Input data.frame has to have exactly 4 columns with the following content and order:
\t1) Term\t\t(character)
\t2) Pvalue\t(numeric)
\t3) Count\t(numeric)
\t4) Side\t\t(recommended: factor with 2 levels).
\t(required type noted in brackets, column names are not important)")
}
if (br_min_size >= br_max_size) {
stop("br_min_size must be different from and smaller than br_max_size!")
}
if(!is.factor(disp_data[[side_col]]))
disp_data[[side_col]] <- as.factor(disp_data[[side_col]])
# set ranks to set branch locations
disp_data$rank <- 0
oldO <- disp_data[1, side_col]
oldR <- 0
for (i in 2:nrow(disp_data)) {
currO <- disp_data[i, side_col]
if (currO == oldO) {
disp_data[i, "rank"] <- oldR + 2
} else {
disp_data[i, "rank"] <- oldR + 1
}
oldR <- disp_data[i, "rank"]
oldO <- currO
}
# derive base values for spacing and p-value scaling from data
if (is.na(br_spacing)){
br_spacing <- round(54 / nrow(disp_data),0)
print(paste("br_spacing set to",br_spacing))
}
# set offset for first branch (if not set it's slightly offset from the top)
if (is.na(br_offset))
br_offset <- br_spacing / 2.7
# trunkdata - plotdata for trunk #### set trunk length based on br_spacing
tr_len <- br_spacing * max(disp_data$rank) + br_spacing / 3
tr_x_offs <- tr_width / 4
tr_x_l <- tr_x - tr_x_offs
tr_x_r <- tr_x + tr_x_offs
trunkdata <- data.frame(xmin = c(tr_x_l, tr_x_r) - tr_x_offs, xmax = c(tr_x_l, tr_x_r) + tr_x_offs,
ymin = tr_yst, ymax = tr_yst + tr_len, side = c("l", "r"), stringsAsFactors = F)
if (!is.null(oneside))
trunkdata <- trunkdata[trunkdata$side == oneside, ]
# base - plotdata for root label ####
base <- data.frame(xst = tr_x, yst = tr_yst, text = b_rootlabel)
# treedata - plotdata for branches ####
# offset for first branch to not start at the very top
top_branch <- tr_yst + tr_len - br_offset
# define scale for count (translates to branch width)
min_count <- min(disp_data[[count_col]])
max_count <- max(disp_data[[count_col]])
# if all values are the same, set dist to 1, in this case all will be scaled to min_size
dist_count <- ifelse(max_count == min_count, 1, max_count - min_count)
dist_size <- br_max_size - br_min_size
# for text labels: offset from the trunk
x_off <- 2.5 + tr_width/8
# generate branch data
treedata <- plyr::adply(disp_data, .margins = 1, .id = NULL, .expand = F, function(row) {
yst <- top_branch - row[["rank"]] * br_spacing
lp <- log(row[[pval_col]], base = 10) # log10(pvalue)
# rescale count to size scale (defined by br_min_size/br_max_size)
size <- br_min_size + (row[[count_col]] - min_count)/dist_count * dist_size
# scale y-offset of text_labels with branch size
text_y_offset <- 0.7 + br_textoffset_y + size/4 + ifelse(is.null(oneside),0,0.2)
if (row[[side_col]] == levels(row[[side_col]])[1]) {
# left branches
br_offset <- br_offset + br_spacing/2
textangle = -br_angle
lineangle = 1 - br_angle
just = 1
side = "l"
trunk_x = tr_x_l
text_x_offset = -(br_textoffset_x + x_off)
} else {
# right branches
textangle = br_angle
lineangle = br_angle
just = 0
side = "r"
trunk_x = tr_x_r
text_x_offset = br_textoffset_x + x_off
}
newBranch <- data.frame(xst = trunk_x, yst = yst,
angle = lineangle,textangle = textangle,
len = -lp, size = size,
text = row[[text_col]], text_x_offset = text_x_offset,
text_y_offset = text_y_offset, just = just,
side = side, stringsAsFactors = F)
return(newBranch)
})
# get maximum -log10(pvalue) for each side
if (is.null(oneside)) {
max_pval_l <- max(treedata[treedata$side == "l", ]$len)
max_pval_r <- max(treedata[treedata$side == "r", ]$len)
} else {
max_pval_l <- max(treedata$len)
max_pval_r <- max_pval_l
}
# maximum p value displayed on the axis (next multiple of 5 after the maximum in the data; at least 10^-10)
ax_pval_max_l <- max(10, ceiling(max_pval_l/5) * 5)
ax_pval_max_r <- max(10, ceiling(max_pval_r/5) * 5)
if(is.na(pval_scaling)) {
sum_x <- ax_pval_max_l + ifelse(is.null(oneside), ax_pval_max_r, 0)
pval_scaling <- round(tr_len / sum_x * 1.5,1)
print(paste("pval_scaling set to",pval_scaling))
}
# scale p-value:
treedata$len <- treedata$len * pval_scaling
# baseaxis - plotdata for p value axis####
# calculate x-axis projections for largest p-values to derive p-value scale on the x-axis (fact_pval_to_xlen).
if (!is.null(oneside)) {
if (oneside == "r") {
graph_xst <- min(treedata$xst - treedata$len * cospi(treedata$angle))
graph_xen <- max(treedata$xst + treedata$len * cospi(treedata$angle))
len_l <- tr_x_l - graph_xst
len_r <- graph_xen - (tr_x_r)
fact_pval_to_xlen <- len_r/max_pval_r
} else if (oneside == "l") {
graph_xst <- min(treedata$xst + treedata$len * cospi(treedata$angle))
graph_xen <- max(treedata$xst - treedata$len * cospi(treedata$angle))
len_l <- tr_x_l - graph_xst
len_r <- graph_xen - (tr_x_r)
fact_pval_to_xlen <- len_l/max_pval_l
}
} else {
graph_xst <- min(treedata$xst + treedata$len * cospi(treedata$angle))
graph_xen <- max(treedata$xst + treedata$len * cospi(treedata$angle))
len_l <- tr_x_l - graph_xst
len_r <- graph_xen - (tr_x_r)
fact_pval_to_xlen <- len_r/max_pval_r # equivalent to len_l/max_pval_l (same scale left and right)
}
# define length of axis by scaling the maximum values displayed
ax_len_l <- fact_pval_to_xlen * ax_pval_max_l
ax_len_r <- fact_pval_to_xlen * ax_pval_max_r
tick_x <- c(-seq(5, ax_pval_max_l, 5), seq(5, ax_pval_max_r, 5)) # create ticks every 5 orders of magnitude
baseaxis <- data.frame(xst = c(tr_x_l - ax_len_l, tr_x_r), yst = tr_yst, angle = 0,
len = c(ax_len_l, ax_len_r), text = "", stringsAsFactors = F)
# add extra line between left and right to make one continuous axis
baseaxis <- rbind(baseaxis, data.frame(xst = tr_x_l, yst = tr_yst, angle = 0,
len = tr_x_r - tr_x_l, text = "", stringsAsFactors = F))
tick_len <- 1 # length of each tick
# add plotdata for each tick
for (tick in tick_x) {
baseaxis <- rbind(baseaxis, data.frame(xst = ifelse(sign(tick) < 0, tr_x_l, tr_x_r) + tick * fact_pval_to_xlen,
yst = tr_yst,
angle = 3/2, len = tick_len, text = paste0("10^{", -abs(tick), "}"), stringsAsFactors = F))
}
# baselabels - plotdata for side labels####
# position labels between ticks in the middle of the axis (same position on both axes, determined by shorter axis)
ax_l<-baseaxis[baseaxis$text!="" & baseaxis$xst < tr_x,"xst"]
ax_r<-baseaxis[baseaxis$text!="" & baseaxis$xst > tr_x,"xst"]
if(length(ax_l) > length(ax_r))
mid<-as.integer(length(ax_r)/2)+1
else
mid<-as.integer(length(ax_l)/2)+1
# position between two ticks:
labeltick_l <- ax_l[mid] - (ax_l[mid] - ax_l[mid-1])/2
labeltick_r <- ax_r[mid] - (ax_r[mid] - ax_r[mid-1])/2
bl_x <- c(labeltick_l, labeltick_r)
baselabels <- data.frame(x = bl_x, y = rep(tr_yst + 0.45, 2), just = 0.5,
text = b_sidelabels, side = c("l", "r"), stringsAsFactors = F)
if (!is.null(oneside)) {
if (oneside == "l") {
baseaxis <- baseaxis[baseaxis$xst < tr_x_l, ]
baselabels <- baselabels[baselabels$x < tr_x_l, ]
} else if (oneside == "r") {
baseaxis <- baseaxis[baseaxis$xst > tr_x_l, ]
baselabels <- baselabels[baselabels$x > tr_x_l, ]
}
}
# create factors here to ensure they have both levels and colors get assigned correctly (especially relevant for the oneside case)
trunkdata$side <- factor(trunkdata$side, levels = c("l", "r"))
treedata$side <- factor(treedata$side, levels = c("l", "r"))
baselabels$side <- factor(baselabels$side, levels = c("l", "r"))
# plot data #### draw the tree trunk
gg <- ggplot() + geom_rect(data = trunkdata, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = side))
# draw the tree branches
gg <- gg + geom_segment(data = treedata[nrow(treedata):1, ], aes(x = xst, y = yst,
xend = xst + cospi(angle) * len,
yend = yst + sinpi(angle) * len,
size = size, color = side), show.legend = F)
# colored labels for each side
gg <- gg + geom_text(data = baselabels, aes(x = x, y = y, hjust = just, vjust = 0,
label = text, color = side),
family = font, size = label_size, show.legend = F)
# branch labels: GO terms
gg <- gg + geom_text(data = treedata, aes(x = xst + text_x_offset,
y = yst + (text_x_offset * tanpi(angle)) + text_y_offset,
label = text,angle = textangle * 180,
hjust = just), family = font, vjust = 0, size = label_size, show.legend = F)
# p-value axis at the bottom
gg <- gg + geom_segment(data = baseaxis, aes(x = xst, y = yst, xend = xst + len * cospi(angle),
yend = yst + len * sinpi(angle)),
size = 0.4, color = "black", show.legend = F)
# labels for p-value axis
if (plot_pvalue_labels) {
gg <- gg + geom_text(data = baseaxis, aes(x = xst, y = yst - 1.2,
hjust = 1, vjust = 1, label = text),
family=font, size = label_size, angle = 45, show.legend = F, color = "black", parse = T)
}
# root box at the bottom
gg <- gg + geom_label(data = base, aes(x = xst, y = yst, label = text),
size = label_size, nudge_y = 0.1, vjust = 1, parse = b_parse,color = root_color, family=font)
# scale_y_continuous expands the plot in y direction; 10% at the bottom and 20% at the top
gg <- gg + scale_size_identity() +
scale_color_manual(values = colors, guide = F, drop = F) +
scale_fill_manual(values = colors, guide = F, drop = F) +
scale_y_continuous(expand = expand_scale(mult = c(0.2, 0.2))) +
scale_x_continuous(expand = expand_scale(mult = expand_x)) +
coord_fixed(ratio = 1) + theme_void()
ggsave(filename = out_file, plot = gg, width = plot_size, height = plot_size)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.