#' Manhattan plots
#'
#' @param input_data_path Path of the input association data.
#' @param custom_peak_annotation_file_path Path of the custom annotation of variants.
#' @param reference_file_path Path to the 'turboman_hg19_reference_data.rda' / 'turboman_hg19_reference_data.rda' reference file.
#' @param pvalue_sign Significance threshold p-value.
#' @param plot_title Plot title which will be displayed on top of the plot.
#' @param vertical_resolution A fixed number of points (pixels) to be plotted vertically.
#' @export
#' @return NULL
#'
#' @details
#' **Input association data file / input_data_path**
#'
#' Define the path of the input association data.
#' The input data needs to be a file that has:
#' 1. Spaces as field separators.
#' 2. One header line.
#' 3. Option I. (no extreme p-values present): 3 columns, being chromosome, position, pvalue in order, column names are not important.
#' Option II. (extreme p-values present): 5 columns, being chromosome, position, pvalue, beta, se in order, column names are not important.
#'
#'**Custom annotation file / custom_peak_annotation_file_path**
#'
#' Define the path of the custom annotation of variants.
#' The input data needs to be a file that has:
#' 1. Spaces / tabs as field separators.
#' 2. One header line with exact column names (order not important).
#' 3. Columns: chromosome, position, label (e.g., gene name) / nearest_gene_name, cis/trans flag (optional).
#' NB!: If no label is given, variants will be automatically annotated
#'
#' **Reference file / reference_file_path**
#'
#' Define the path to the 'turboman_hg19_reference_data.rda' / 'turboman_hg38_reference_data.rda' reference file that
#' contains the LD block breaks as in \insertCite{berisa16;textual}{pQTLtools}
#' and gene coordinates used to construct and annotate the Manhattan plot. Both
#' are available from the `turboman` directory of the installed package, e.g.,
#' `file.path(find.package('pQTLtools'),'turboman','turboman_hg38_reference_data.rda')`.
#'
#' **Significance threshold p-value / pvalue_sign**
#'
#' Define the significance threshold.
#' This will be used to
#' 1. Highlight signal peaks that come above this significance threshold.
#' 2. Annotate the nearest gene to the top signal in the peak.
#' 3. Draw a horizontal reference line equal to this threshold.
#'
#' **Title of the plot / plot_title**
#'
#' Define title on top of the plot.
#'
#' **Number of pixels on vertical axis / vertical_resolution**
#'
#' Define a fixed number of points (pixels) on vertical axis.
#'
#' @examples
#' \dontrun{
#' # Screen output
#' require(gap.datasets)
#' test <- mhtdata[c('chr','pos','p')]
#' write.table(test,file='test.txt',row.names=FALSE,quote=FALSE)
#' annotate <- subset(mhtdata[c('chr','start','gene','p')],p<5e-8 & gene!='')
#' names(annotate) <- c('chromosome','position','nearest_gene_name','p')
#' write.table(unique(annotate[,-4]),file='annotate.txt',row.names=FALSE,quote=FALSE)
#' input_data_path <- 'test.txt'
#' custom_peak_annotation_file_path <- 'annotate.txt'
#' reference_file_path <-
#' file.path(find.package('pQTLtools'),'turboman','turboman_hg19_reference_data.rda')
#' pvalue_sign <- 5e-8
#' plot_title <- 'gap.datasets example'
#' turboman(input_data_path, custom_peak_annotation_file_path,
#' reference_file_path, pvalue_sign, plot_title)
#' # Figure shown on https://github.com/jinghuazhao/tests/tree/main/turboman
#' png('IL12B.png',width=3600, height=3600, pointsize = 12, res=450)
#' input_data_path <- 'IL.12B.txt.gz'
#' custom_peak_annotation_file_path <- 'IL.12B.annotate'
#' reference_file_path <-
#' file.path(find.package('pQTLtools'),'turboman','turboman_hg19_reference_data.rda')
#' pvalue_sign <- 5e-8
#' plot_title <- 'IL12B'
#' turboman(input_data_path, custom_peak_annotation_file_path,
#' reference_file_path, pvalue_sign, plot_title)
#' dev.off()
#' }
#'
#' @author Arthur Gilly, Chris Finan, Bram Prins, see \href{https://github.com/bpprins/turboman}{https://github.com/bpprins/turboman}.
#'
#' @references
#' \insertAllCited{}
turboman <- function(input_data_path, custom_peak_annotation_file_path, reference_file_path,
pvalue_sign, plot_title, vertical_resolution = 1800)
{
### ================================================================================================================================================###
### 1. Defining input settings ###
### ================================================================================================================================================###
## Verbose printing status bars
fat_status_bar <- strrep("=", 108)
skinny_status_bar <- strrep("-", 108)
print_status_bar <- function(message)
{
cat("\n", fat_status_bar, "\n", message, "\n", fat_status_bar, "\n\n", sep = "")
}
print_status_bar("1. Defining input settings")
## Log start time
start.time <- Sys.time()
print(paste0(" Starting at ", start.time))
## Read in arguments from the command line
for (arg in commandArgs(trailingOnly = TRUE))
{
ta = strsplit(arg, "=", fixed = TRUE)
if (!is.na(ta[[1]][2]))
{
assign(ta[[1]][1], ta[[1]][2])
} else {
stop("Not all arguments are given")
}
}
## Check if the custom peak annotation data file exists
custom_peak_annotation_file_path_exists <- exists("custom_peak_annotation_file_path")
## Assign variable classes
input_data_path <- as.character(input_data_path)
if (custom_peak_annotation_file_path_exists)
{
custom_peak_annotation_file_path <- as.character(custom_peak_annotation_file_path)
}
reference_file_path <- as.character(reference_file_path)
build37 <- grepl("hg19", reference_file_path)
build38 <- grepl("hg38", reference_file_path)
pvalue_sign <- as.numeric(pvalue_sign)
plot_title <- as.character(plot_title)
## Load the reference data
load(reference_file_path)
### ================================================================================================================================================###
### 2. Reading in the plotting data, log10 transform the pvalues, initial
### basic data sanity checks ###
### ================================================================================================================================================###
print_status_bar("2. Reading in the plotting data, log10 transform the pvalues, initial basic data sanity checks")
print(paste0(" Data file path : ", input_data_path))
#---------------------------------------------------------------------
# Reading in association plot data with scan
#---------------------------------------------------------------------
initial_data_dims <- dim(as.data.frame(read.table(input_data_path, header = TRUE,
stringsAsFactors = FALSE, nrows = 10)))[2]
if (initial_data_dims == 3)
{
initial_data <- data.frame(scan(input_data_path, what = list(chromosome = 0,
position = 0, pvalue = 0), skip = 1, sep = " ", quiet = TRUE))
initial_data_contains_beta_se <- FALSE
} else if (initial_data_dims == 5) {
initial_data <- data.frame(scan(input_data_path, what = list(chromosome = 0,
position = 0, pvalue = 0, beta = 0, se = 0), skip = 1, sep = " ", quiet = TRUE))
initial_data_contains_beta_se <- TRUE
} else {
stop("Input data does not have expected dimensions")
}
#---------------------------------------------------------------------
# Reading in peak annotation plot data with read.table and check if it is
# already annotated with labels / gene names (# Check if the variants are
# annotated with gene names
#---------------------------------------------------------------------
if (custom_peak_annotation_file_path_exists)
{
gene_plot_data <- data.frame(read.table(custom_peak_annotation_file_path,
header = TRUE, stringsAsFactors = FALSE))
nearest_gene_names_annotated <- ("nearest_gene_name" %in% colnames(gene_plot_data))
} else {
nearest_gene_names_annotated <- FALSE
}
#---------------------------------------------------------------------
# Preparing the association data
#---------------------------------------------------------------------
## Check if p-values are already logged
if (length(which(initial_data$pvalue > 1)) > 0)
{
initial_data$log_pvalue <- initial_data$pvalue
## Get only the complete data
initial_data <- initial_data[complete.cases(initial_data), ]
# Remove the original pvalues
initial_data$pvalue <- NULL
} else {
## Calculate the -log10 p-value for the input data
initial_data$log_pvalue <- -log10(initial_data$pvalue)
## If beta/SE are provided, and pvalues are missing (because they are
## extreme), log10 P recalculate from beta/SE
missing_pvalues_index <- which((is.na(initial_data$log_pvalue) | initial_data$log_pvalue ==
0))
if (initial_data_contains_beta_se & (length(missing_pvalues_index) > 0))
{
# Calculate expected p-values for missing data
missing_pvalues <- (-log(2, base = 10) - pnorm(-abs(initial_data$beta[missing_pvalues_index]/initial_data$se[missing_pvalues_index]),
log.p = T)/log(10))
# Only replace if indeed they were below the smallest non-zero
# normalized floating-point number
initial_data[missing_pvalues_index, c("log_pvalue")] <- ifelse(missing_pvalues >
-log10(.Machine$double.xmin), missing_pvalues, NA)
}
## Get only the complete data
initial_data <- initial_data[complete.cases(initial_data), ]
# Remove the original pvalues
initial_data$pvalue <- NULL
}
# Calculate -log10 of significance threshold
log_pvalue_sign <- -log10(pvalue_sign)
### ================================================================================================================================================###
### 3. Preparing data for plotting, calculating variables related to
### plotting ###
### ================================================================================================================================================###
print_status_bar("3. Preparing data for plotting, calculating variables related to plotting")
## Set vertical resolution Find the largest p-value, which we will use to
## make the y-axis 'resolution'
observed_log_pvalue_maximum <- max(initial_data$log_pvalue, na.rm = TRUE)
## Now we will scale the resolution for the p-values
log_pvalue_break_size <- observed_log_pvalue_maximum/vertical_resolution
## Create a vector from 0 to the vertical resolution, which we will use to
## bin pvalues
scaling_vector <- seq(0, vertical_resolution, by = log_pvalue_break_size)
## Obtain all unique chromosome numbers for which we have data
chromosomes <- unlist(unique(initial_data$chromosome), recursive = FALSE, use.names = FALSE)
### ================================================================================================================================================###
### 4. Data reduction procedure, creating plotting data ###
### ================================================================================================================================================###
print_status_bar("4. Data reduction procedure, creating plotting data")
## Create an empty dataframe for the plotting data
plot_data <- NULL
plot_data <- as.data.frame(plot_data)
## If the custom annotation data is not given, create an empty data frame
## for annotation downstream which will be filled
if (!custom_peak_annotation_file_path_exists)
{
gene_plot_data <- NULL
gene_plot_data <- as.data.frame(gene_plot_data)
}
## Initialise a counter that keeps track of how many top signals we have
top_snp_counter <- 0
## Define the LD block data (loaded from rda reference data file)
ld_label <- "ld_block_breaks_pickrell_hg"
ld_block_breaks <- get(ls()[substring(ls(), 1, nchar(ld_label)) == ld_label])
if (build37)
ld_block_breaks <- rbind(ld_block_breaks, data.frame(chr = c(23, 23), start = c(1,
gap::hg19[23])))
if (build38)
ld_block_breaks <- rbind(ld_block_breaks, data.frame(chr = c(23, 23), start = c(1,
gap::hg38[23])))
## Define the gene annotation data (loaded from rda reference data file)
ref_label <- "refgene_gene_coordinates_h"
gene_coordinates <- get(ls()[substring(ls(), 1, nchar(ref_label)) == ref_label])
## Start the loop that will go over all unique chromosomes to reduce the
## data
for (chromosome_number in chromosomes)
{
## Verbose progress tracker
johnny_bravo <- ifelse(chromosome_number%%2 == 0, " ha !", " hoo !")
print(paste0("chromosome ", chromosome_number, johnny_bravo))
#---------------------------------------------------------------------
# Reduce the position data for plotting using the LD blocks (X-axis)
#---------------------------------------------------------------------
## Extract the LD block breaks for the chromosome
chromosome_ld_block_breaks <- ld_block_breaks[which(ld_block_breaks[, 1] == chromosome_number), 2]
## Count the number of LD block breaks for the chromosome
number_of_ld_block_bins <- as.integer(length(chromosome_ld_block_breaks) - 1)
## Select the association data for this chromosome to be reduced
initial_data_chromosome <- initial_data[which(initial_data$chromosome == chromosome_number), ]
## Assign bin numbers to the positions based on the LD block breaks
initial_data_chromosome$bin <- findInterval(initial_data_chromosome$position, chromosome_ld_block_breaks)
## Calculate the midpoints of the breaks, which we will use as
## X-coordinates on the Manhattan plot.
plot_x_coordinates <- (head(chromosome_ld_block_breaks, -1) + diff(chromosome_ld_block_breaks)/2)
## Create a temporary dataframe in which we will first assemble the
## plotting data per chromosome
plot_data_per_chromosome_df <- NULL
plot_data_per_chromosome_df <- as.data.frame(plot_data_per_chromosome_df)
## Determine the number of bins in a chromosome, for which we will loop
## over to reduce the p_value data
unique_x_bins <- number_of_ld_block_bins
for (bin_number in 1:unique_x_bins)
{
#-----------------------------------------------------------------------------------
# Reduce the p-value data for plotting to imagined resolution of
# vertical_resolution
#-----------------------------------------------------------------------------------
## Create a temporary dataframe in which we will first assemble the
## plotting data per bin in a chromosome
plot_data_per_bin_in_chromosome_df <- NULL
plot_data_per_bin_in_chromosome_df <- as.data.frame(plot_data_per_bin_in_chromosome_df)
## Now reduce (bin) the p-values (Y-axis values) to
## vertical_resolution bins, and multiply each bin (starting from 1
## to max resolution) by the calculated pvalue_break_size
plot_data_per_bin_in_chromosome_pvalues <-
(unique(.bincode((initial_data_chromosome[["log_pvalue"]][which(initial_data_chromosome$bin ==
bin_number)]), scaling_vector, right = TRUE, include.lowest = FALSE) *
log_pvalue_break_size))
## If there are no p-values for a bin, enter one line with
## chromosome and position, missing pvalue, and missing highlight
## value
n_indexes <- length(plot_data_per_bin_in_chromosome_pvalues)
if (n_indexes == 0) {
plot_data_per_bin_in_chromosome_df[1, 1] <- chromosome_number
plot_data_per_bin_in_chromosome_df[1, 2] <- plot_x_coordinates[bin_number]
plot_data_per_bin_in_chromosome_df[1, 3:5] <- NA
## Name the columns in the plotting data dataframe that was made for this bin
colnames(plot_data_per_bin_in_chromosome_df) <- c("chromosome", "position",
"log_pvalue", "highlight_vector", "highlight_color")
} else {
## First find the top SNP in the bin based on the maximum
## p-value in this bin
largest_pvalue_in_bin <-
max((initial_data_chromosome[["log_pvalue"]][which(initial_data_chromosome$bin == bin_number)]), na.rm = TRUE)[1]
largest_pvalue_index_in_bin <-
which(initial_data_chromosome[["log_pvalue"]][which(initial_data_chromosome$bin ==
bin_number)] == largest_pvalue_in_bin)
## If the largest p-value in the bin is significant, continue
## with finding the matching position for the SNP with highest
## p-value, and it's nearest gene but only if a custom
## annotation file is not given
if ((largest_pvalue_in_bin > log_pvalue_sign) & (!custom_peak_annotation_file_path_exists)) {
## Increase top SNP counter
top_snp_counter = top_snp_counter + 1
## Define the position of the top SNP in the bin
largest_pvalue_index_in_bin_position <-
initial_data_chromosome[which(initial_data_chromosome$bin ==
bin_number), ][largest_pvalue_index_in_bin, ][["position"]][1]
## Extract the gene annotation data from the gene table for
## this particular chromosome
gene_coordinates_chromosome <- gene_coordinates[which(gene_coordinates$chromosome == chromosome_number), ]
## Find the smallest distances to the position of our top SNP
## OLD CODE: smallest_distance_to_gene_for_top_snp_in_bin <-
## min(abs(gene_coordinates_chromosome$gene_transcription_midposition-largest_pvalue_index_in_bin_position),na.rm=TRUE)
smallest_distance_to_gene_start_for_top_snp_in_bin <- min(abs(gene_coordinates_chromosome$gene_transcription_start -
largest_pvalue_index_in_bin_position), na.rm = TRUE)
smallest_distance_to_gene_stop_for_top_snp_in_bin <- min(abs(gene_coordinates_chromosome$gene_transcription_stop -
largest_pvalue_index_in_bin_position), na.rm = TRUE)
## Find which gene corresponds to the smallest distances to
## the position of our top SNP OLD CODE:
## genename_for_top_snp_in_bin<-as.character(gene_coordinates_chromosome[which(abs(gene_coordinates_chromosome$gene_transcription_midposition-largest_pvalue_index_in_bin_position)==smallest_distance_to_gene_for_top_snp_in_bin),c('gene_name')])[1]
if (smallest_distance_to_gene_start_for_top_snp_in_bin < smallest_distance_to_gene_stop_for_top_snp_in_bin) {
genename_for_top_snp_in_bin <-
as.character(gene_coordinates_chromosome[which(abs(gene_coordinates_chromosome$gene_transcription_start -
largest_pvalue_index_in_bin_position) == smallest_distance_to_gene_start_for_top_snp_in_bin),
c("gene_name")])[1]
} else {
genename_for_top_snp_in_bin <-
as.character(gene_coordinates_chromosome[which(abs(gene_coordinates_chromosome$gene_transcription_stop -
largest_pvalue_index_in_bin_position) == smallest_distance_to_gene_stop_for_top_snp_in_bin),
c("gene_name")])[1]
}
## Enter the chromosome of the top SNP in the gene annotation
## dataframe which we will use in the plot
gene_plot_data[top_snp_counter, 1] <- chromosome_number
## Enter the mid-bin coordinate for the top SNP in the gene
## annotation dataframe which we will use in the plot
gene_plot_data[top_snp_counter, 2] <- plot_x_coordinates[bin_number]
## Enter the pvalue for the top SNP in the gene annotation
## dataframe which we will use in the plot
gene_plot_data[top_snp_counter, 3] <- largest_pvalue_in_bin
## Enter the nearest gene for the top SNP in the gene
## annotation dataframe which we will use in the plot
gene_plot_data[top_snp_counter, 4] <- genename_for_top_snp_in_bin
## Name the columns of the gene annotation dataframe
colnames(gene_plot_data) <- c("chromosome", "position", "log_pvalue",
"nearest_gene_name")
}
## Else if there are p-values for a bin, enter the chromosome,
## the midposition for this bin, the pvalue, and vector values
## telling whether this bin should be highlighted in the plot
indexes <- 1:n_indexes
plot_data_per_bin_in_chromosome_df[indexes, 1] <- rep(chromosome_number, n_indexes)
plot_data_per_bin_in_chromosome_df[indexes, 2] <- rep(plot_x_coordinates[bin_number], n_indexes)
plot_data_per_bin_in_chromosome_df[indexes, 3] <- plot_data_per_bin_in_chromosome_pvalues
plot_data_per_bin_in_chromosome_df[indexes, 4] <- ifelse(largest_pvalue_in_bin >
log_pvalue_sign, rep(1, n_indexes), rep(0, n_indexes))
plot_data_per_bin_in_chromosome_df[indexes, 5] <- ifelse(largest_pvalue_in_bin >
log_pvalue_sign, rep(1, n_indexes), rep(0, n_indexes))
## Name the columns in the plotting data dataframe that was
## made for this bin
colnames(plot_data_per_bin_in_chromosome_df) <- c("chromosome", "position",
"log_pvalue", "highlight_vector", "highlight_color")
}
## Add the per-bin plotting data dataframe to the per-chromosome
## plotting data dataframe
plot_data_per_chromosome_df <- rbind(plot_data_per_chromosome_df, plot_data_per_bin_in_chromosome_df)
## Remove the per-bin plotting data dataframe
rm(plot_data_per_bin_in_chromosome_df)
}
## Add the per-chromosome plotting data dataframe to the per-chromosome
## plotting data dataframe
plot_data <- rbind(plot_data, plot_data_per_chromosome_df)
## Remove the per-chromosome plotting data dataframe
rm(plot_data_per_chromosome_df)
rm(initial_data_chromosome)
}
#-----------------------------------------------------------------------------------------
# Processing custom annotation for variants
#-----------------------------------------------------------------------------------------
## If the custom peak annotation file exists (chromosomes and positions of
## variants provided), but there are no labels present, perform an
## annotation for nearest genes and look up their pvalues in the
## association data
if (custom_peak_annotation_file_path_exists & !nearest_gene_names_annotated)
{
## Create an additional column filled with NA
gene_plot_data$nearest_gene_name <- NA
## Count how many variants should be annotated
number_of_peak_annotations <- dim(gene_plot_data)[1]
## Loop over the variants, find the nearest genes and p-values of these
## variants in the association data
for (peak_number in 1:number_of_peak_annotations)
{
peak_snp_chromosome <- gene_plot_data$chromosome[peak_number]
peak_snp_position <- gene_plot_data$position[peak_number]
## Extract the gene annotation data from the gene table for this
## particular chromosome
gene_coordinates_chromosome <- gene_coordinates[which(gene_coordinates$chromosome ==
peak_snp_chromosome), ]
## Find the smallest distances to the position of our top SNP
smallest_distance_to_gene_for_top_snp_in_bin <- min(abs(gene_coordinates_chromosome$gene_transcription_midposition -
peak_snp_position), na.rm = TRUE)
## Find which gene corresponds to the smallest distances to the
## position of our top SNP
genename_for_top_snp_in_bin <-
as.character(gene_coordinates_chromosome[which(abs(gene_coordinates_chromosome$gene_transcription_midposition -
peak_snp_position) == smallest_distance_to_gene_for_top_snp_in_bin),
c("gene_name")])[1]
gene_plot_data$nearest_gene_name[peak_number] <- genename_for_top_snp_in_bin
## Also find the p-vals !!!
gene_plot_data$log_pvalue[peak_number] <-
initial_data[which(initial_data$chromosome == peak_snp_chromosome &
initial_data$position == peak_snp_position), c("log_pvalue")][1]
}
} else if (custom_peak_annotation_file_path_exists & nearest_gene_names_annotated) {
## Count how many variants should be annotated
number_of_peak_annotations <- dim(gene_plot_data)[1]
## Loop over the variants, find the nearest genes and p-values of these
## variants in the association data
for (peak_number in 1:number_of_peak_annotations)
{
peak_snp_chromosome <- gene_plot_data$chromosome[peak_number]
peak_snp_position <- gene_plot_data$position[peak_number]
## Also find the p-vals !!!
gene_plot_data$log_pvalue[peak_number] <-
initial_data[which(initial_data$chromosome == peak_snp_chromosome &
initial_data$position == peak_snp_position), c("log_pvalue")][1]
}
} else {
cat("\n Custom annotation data fully provided\n\n")
}
### ================================================================================================================================================###
### 5. Start plotting ###
### ================================================================================================================================================###
print_status_bar("5. Start plotting")
## Define image properties Sorting the plotting data from the GWAS datafile
## and the gene annotation file
plot_data <- plot_data[order(plot_data$chromosome, plot_data$position), ]
if (dim(gene_plot_data)[1] > 0)
{
gene_plot_data <- gene_plot_data[order(gene_plot_data$chromosome, gene_plot_data$position),
]
}
## Putting the data in vectors, easier to work with
chromosomes <- plot_data$chromosome
positions <- plot_data$position
log_pvalues <- plot_data$log_pvalue
highlight_vector <- plot_data$highlight_vector
unique_chromosomes <- unique(chromosomes)
## Defining the Y-axis maxima to be used for the Y-axis limit to plot the
## association data and the gene annotation Truncate the maximum p-value to
## an integer
log_pvalue_truncated <- trunc(max(log_pvalues, na.rm = TRUE))
# Use the truncated maximum p-value to define nice Y-axis limits
y_axis_plot_data_limit <- ifelse(log_pvalue_truncated < 10, 10, ifelse(log_pvalue_truncated <
15, 15, ifelse(log_pvalue_truncated < 15, 15, (((trunc(log_pvalue_truncated/10) +
1) * 10)))))
## Define the limits for the gene annotations / lines and start of gene
## name display and real size of plot window
y_axis_stop_gene_annotation_vertical_lines <- y_axis_plot_data_limit
y_axis_stop_gene_annotation_diagonal_lines <- y_axis_plot_data_limit * 1.1
y_axis_true_limit <- y_axis_plot_data_limit * 1.3
### Setting the plotting data How many chromosomes / plot only the
### chromosomes for which there's data
max_nchr <- length(unique(chromosomes))
## Set indices
x <- 1:max_nchr
x2 <- 1:max_nchr
## Define the actual plotting X coordinates as will be used in the plot,
## based on basepair positions of the variants
for (i in 1:max_nchr)
{
chromosome_number = which(chromosomes == i)
x[i] <- trunc((max(na.omit(positions[chromosome_number])))/100) + 1e+05
x2[i] <- trunc((min(na.omit(positions[chromosome_number])))/100) - 1e+05
}
x[1] <- x[1] - x2[1]
x2[1] <- 0 - x2[1]
for (i in 2:(max_nchr + 1))
{
x[i] <- x[i - 1] - x2[i] + x[i]
x2[i] <- x[i - 1] - x2[i]
}
## Calculate the final x-coordinates of the association data to plot
x_coordinates <- trunc(positions/100) + x2[chromosomes]
## Define the x-axis limit
x_axis_limit <- max(x_coordinates, na.rm = TRUE) - min(x_coordinates, na.rm = TRUE)
## Set the final y-coordinates of the association data to plot
y_coordinates <- log_pvalues
## Calculate the final x-coordinates of the top SNPs with annotated genes
## to plot
if (nrow(gene_plot_data) > 0)
{
gene_x_coordinates <- trunc(gene_plot_data$position/100) + x2[gene_plot_data$chromosome]
## Set the final y-coordinates of the top SNPs with annotated genes to
## plot
gene_y_coordinates <- gene_plot_data$log_pvalue
## Set the nearest gene names of the top SNPs to plot
nearest_gene_names_hits <- gene_plot_data$nearest_gene_name
nearest_gene_names_cistrans <- rep("trans", nrow(gene_plot_data))
if (!is.null(gene_plot_data$cistrans))
nearest_gene_names_cistrans <- gene_plot_data$cistrans
}
## Set the default colors of the all association datapoints, grey and light
## grey, alternating between odd and even chromosome numbers
col1 <- "gray72"
col2 <- "gray50"
chromosome_colour <- ifelse(chromosomes%%2 == 0, col1, col2)
## Plot the association data
plot(x_coordinates, y_coordinates, pch = 20, col = chromosome_colour, axes = F,
ylab = "", xlab = "", bty = "n", ylim = c(0, y_axis_true_limit), cex = 0.8,
main = plot_title)
## Plot the gene annotation data
if (nrow(gene_plot_data) > 0)
{
## Create X-axis breaks at which the gene names will be plotted,
## chosing random number of 150 as I simply Assume maximally 150 peaks
## will be annotated and at 150 genes I hope no gene names will be
## displayed overlapping
x_axis_break_factor <- x_axis_limit/150
## Draw top SNP gene annotation lines
y_axis_stop_gene_annotation_vertical_lines <- y_axis_plot_data_limit
y_axis_stop_gene_annotation_diagonal_lines <- y_axis_plot_data_limit * 1.1
y_axis_true_limit <- y_axis_plot_data_limit * 1.3
for (i in 1:length(gene_x_coordinates))
{
## Define the coordinates for the vertical annotation lines
vertical_annotation_line_x_coordinate_start <- gene_x_coordinates[i]
vertical_annotation_line_x_coordinate_stop <- gene_x_coordinates[i]
vertical_annotation_line_y_coordinate_start <- gene_y_coordinates[i]
vertical_annotation_line_y_coordinate_stop <- y_axis_stop_gene_annotation_vertical_lines
## Define the diagonal for the vertical annotation lines
diagonal_annotation_line_x_coordinate_start <- gene_x_coordinates[i]
diagonal_annotation_line_x_coordinate_stop <- ((x_axis_limit/length(gene_x_coordinates))/2) +
((i - 1) * (x_axis_limit/length(gene_x_coordinates)))
diagonal_annotation_line_y_coordinate_start <- y_axis_stop_gene_annotation_vertical_lines
diagonal_annotation_line_y_coordinate_stop <- y_axis_stop_gene_annotation_diagonal_lines
## Draw the vertical annotation lines
lines(c(vertical_annotation_line_x_coordinate_start, vertical_annotation_line_x_coordinate_stop),
c(vertical_annotation_line_y_coordinate_start, vertical_annotation_line_y_coordinate_stop),
col = "grey", lty = 2, lwd = 1)
## Draw the diagonal annotation lines
lines(c(diagonal_annotation_line_x_coordinate_start, diagonal_annotation_line_x_coordinate_stop),
c(diagonal_annotation_line_y_coordinate_start, diagonal_annotation_line_y_coordinate_stop),
col = "grey", lty = 2, lwd = 1)
## Plot the gene names for each top SNP calculating font sizes
number_of_annotations_to_plot <- dim(gene_plot_data)[1]
maximum_characters_annotation <- max(nchar(gene_plot_data$nearest_gene_name),
na.rm = TRUE)
if ((number_of_annotations_to_plot <= 70) & (maximum_characters_annotation <=
9)) {
gene_label_cex_size <- 1
} else if ((number_of_annotations_to_plot > 70) & (maximum_characters_annotation <=
9)) {
gene_label_cex_size <- 1.3 - (0.006 * number_of_annotations_to_plot)
} else if ((number_of_annotations_to_plot <= 70) & (maximum_characters_annotation >
9)) {
gene_label_cex_size <- 1.15 - (0.03264 * maximum_characters_annotation)
} else ((number_of_annotations_to_plot > 70) & (maximum_characters_annotation >
9))
gene_label_cex_size_n_genes_annotation <- 1.3 - (0.006 * number_of_annotations_to_plot)
gene_label_cex_size_max_char_annotation <- 1.15 - (0.03264 * maximum_characters_annotation)
gene_label_cex_size <- ifelse(gene_label_cex_size_n_genes_annotation <
gene_label_cex_size_max_char_annotation, gene_label_cex_size_n_genes_annotation,
gene_label_cex_size_max_char_annotation)
# Plot the labels
text(diagonal_annotation_line_x_coordinate_stop, diagonal_annotation_line_y_coordinate_stop,
labels = nearest_gene_names_hits[i], cex = gene_label_cex_size, srt = 90,
adj = c(0, 0.5), font = 3, ps = 12, col = ifelse(nearest_gene_names_cistrans[i] ==
"cis", "red", "blue"))
}
}
## Draw the significanc threshold
lines(c(0, max(x_coordinates, na.rm = TRUE)), c(log_pvalue_sign, log_pvalue_sign),
col = "dodgerblue4", lty = 2, lwd = 1)
## Draw horizontal axis(can't find a way to nicely draw a x-axis without
## not running over points:
lines(c(0, max(x_coordinates, na.rm = TRUE)), c((y_axis_plot_data_limit/200) *
-1, (y_axis_plot_data_limit/200) * -1), col = "black", lty = 1, lwd = 1)
## Highlight the significant peaks
points(x_coordinates[which(highlight_vector == 1)], y_coordinates[which(highlight_vector ==
1)], col = "dodgerblue4", pch = 20, cex = 0.8)
## Make (non-existing) x-axis text and labels
x_axis_chromosome_labels_text <- c(1:22, "X", "Y XY M", "", "")
x_axis_chromosome_labels <- x_axis_chromosome_labels_text[unique_chromosomes]
for (i in 1:max_nchr)
{
label_positions = (x[i] + x2[i])/2
mtext(x_axis_chromosome_labels[i], 1, at = label_positions, cex = 1, line = 0, las = 2)
}
## Set the horizontal axis text - 'Chromosome' and each chromosome number
## for which the data is plotted
mtext("Chromosome", 1, at = x[max_nchr]/2, cex = 1, line = 2)
## Draw the y-axis with value ticks
axis(2, las = 2, pos = 0, yaxp = c(0, y_axis_plot_data_limit, 10))
## Draw the y-axis label
mtext(expression(paste(-"log"[10], " p-value")), 2, line = 1)
## Calculate how much time things took
end.time <- Sys.time()
time.taken <- difftime(end.time, start.time, units = "mins")
cat("It took", time.taken, "minutes to complete this job\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.