Nothing
# QCEWAS v1.2-3
# Created by Peter van der Most, April 2015
# Updated December 2016, August 2018, February 2019, January-February 2023
# Based on code for the QCGWAS package
# Remember to adjust version number here and in output log, and activate onAttach functie
# don't export the zf_testLogical function
.onAttach <- function(libname, pkgname) {
packageStartupMessage("")
packageStartupMessage("QCEWAS library, version 1.2-3")
packageStartupMessage(paste0("A Quick Start Guide can be found in ", system.file("doc", package = "QCEWAS")))
packageStartupMessage("")
}
# zf_testLogical is an internal function to test if arguments are logical
zf_testLogical <- function(param, notNA = TRUE){
OK <- is.logical(param) & is.vector(param) & length(param) == 1L
if(OK & notNA) { OK <- !is.na(param) }
return(OK)
}
# Function P_correlation is no longer the same check_p as in QCGWAS. The save name has been changed, the log entries & HQ subset removed and the colnames have been updated
P_correlation <- function(dataset,
plot_correlation = TRUE, plot_if_threshold = FALSE, threshold_r = 0.99,
high_quality_plots = FALSE,
save_name = "dataset", header_translations, ...) {
if(!missing(header_translations)) {
header_test <- translate_header(header = colnames(dataset), standard = c("P_VAL", "BETA", "SE"), alternative = header_translations)
if(any(duplicated(header_test$header_h))) stop("cannot translate header - duplicate column names")
if(header_test$missing_N > 0L) stop(paste("Cannot identify data column(s):", paste(header_test$missing_h, collapse = ", ")))
colnames(dataset) <- header_test$header_h
}
goodOnes <- !(is.na(dataset$P_VAL) | is.na(dataset$BETA) | is.na(dataset$SE))
if(sum(goodOnes) > 10L) {
p_obs <- -log10(dataset$P_VAL[goodOnes])
p_exp <- -log10(pchisq((dataset$BETA[goodOnes]/dataset$SE[goodOnes])^2, 1, lower.tail=FALSE))
if(any(p_obs > 300)) {
p_obs[p_obs > 300] <- 300
warning("Extreme p-values in dataset - temporarly set to 1e-300 for correlation calculation and plot")
}
if(any(p_exp > 300)) {
p_exp[p_exp > 300] <- 300
warning("Expected p-values capped at 1e-300")
}
p_cor <- cor(p_obs, p_exp, use = "everything")
if(is.na(p_cor)){
warning("Unable to calculate p-value correlation")
return(NA) # uses return here so that the if loops below are skipped
}
if(p_cor < threshold_r) {
print(paste0("Warning: reported p-values correlate poorly to expected values(r = ", round(p_cor, digits = 3), ")"), quote = FALSE)
}
if(plot_correlation & (p_cor < threshold_r | !plot_if_threshold )) {
p_max <- max(p_obs, p_exp)
if(high_quality_plots){
tiff(filename = paste0(save_name, ".graph_p.tiff"), units = "in", width = 7, height = 7, res = 350,
compression = "lzw")
} else {
png(paste0(save_name, ".graph_p.png"),
width = 720, height = 720, res = 144)
}
plot(x = 0, y = 0, col = "white",
xlim = c(0, p_max), ylim = c(0, p_max),
main="P-value correlation", xlab = "Expected -log10(p)", ylab = "Observed -log10(p)",
sub = save_name, cex.sub = 1.3, ...)
lines(x = c(0, 1.1 * p_max), y = c(0, 1.1 * p_max))
points(p_exp, p_obs, col = "black", pch = 20, cex = 0.8)
text(0.0 * p_max, 0.95 * p_max, paste("r =", round(p_cor, digits = 3)), pos = 4, cex=1.0, col = ifelse(p_cor < threshold_r, "red", "black") )
dev.off()
}
} else {
print("p-test aborted: less than 10 entries with the data required to calculate corelation", quote = FALSE)
p_cor <- NA
}
return(p_cor)
}
# Function EWAS_plots is an edited version of QC_plots from QCGWAS.
# EWAS_plots was edited to prevent a bug from occuring, and to suppress messages
# as well as suppresing an underline in the QQ plot
# also removed a HQ filter call
# lateron, we removed all code relating to the filters
P_lambda <- function(p){
p <- p[!is.na(p)]
if(length(p) < 2L) stop("'p' does not contain sufficient non-missing values to calculate lambda")
return(median(qchisq(p, df=1, lower.tail=FALSE)) / qchisq(0.5, 1))
}
EWAS_plots <- function(dataset, plot_QQ = TRUE, plot_Man = TRUE,
plot_cutoff_p = 0.05,
plot_QQ_bands = FALSE,
high_quality_plots = FALSE,
save_name = "dataset",
header_translations) {
if(is.vector(dataset)) {
dataset <- data.frame(P_VAL = dataset)
if(plot_Man){
plot_Man <- FALSE
warning("Cannot create a Manhattan plot without chromosome/position values")
}
}
colnames(dataset) <- toupper(colnames(dataset))
header_std <- c("P_VAL", "CHR", "MAPINFO")[c(TRUE, plot_Man, plot_Man)]
if(missing(header_translations)) {
header_missing <- header_std[!header_std %in% colnames(dataset)]
if(length(header_missing) > 0L) { stop(paste0("Missing columns: ", paste0(header_missing, collapse = ", "))) }
} else {
header_test <- translate_header(header = colnames(dataset), standard = header_std, alternative = header_translations)
if(any(duplicated(header_test$header_h))) stop(paste0("Cannot translate header - duplicate columns: ", paste0(header_test$header_h[duplicated(header_test$header_h)], collapse = ", ") ))
if(header_test$missing_N > 0L) stop(paste("Cannot identify data column(s):", paste(header_test$missing_h, collapse = ", ")))
colnames(dataset) <- header_test$header_h
}
# Stage 1: creating filters & calculating lambda
# Calculating lambda
# First the script creates a short-list by throwing out NA's (!) to calculate the lambda's,
# and a p-cutoff filter is created (!). The script checks whether there are still more than
# 10 datapoints available at both exclamation marks. If yes, the script uses the shortlist
# to generate the unfiltered expected and observed QQ plots; and the long list plus the
# addQQplot function to generate the filtered plots.
# WARNING: note that the content of QQ_obs_short changes: it starts out as the p-values
# minus NA (*), then -log10 values (#), then -log10 values minus p > 0.05 (***). Lambda needs
# to be calculated at the (*). QQ_exp is calculated at (#), but (*) works too; the contents
# of the vector used to calculate QQ_exp do not matter, only its length. QQ_exp changes
# as well: a copy without > 0.05 is made at (***: QQ_exp_short). The copy is used for
# the graphs, but QQ_exp is necessary for the probability clouds. If so, it's shortened
# to 1000 points to prevent the polygon functions from being overloaded.
QQ_obs_p <- dataset$P_VAL
QQ_obs_short <- subset(QQ_obs_p, !is.na(QQ_obs_p))
QQ_obs_N <- length(QQ_obs_short) # N of non-missing p's
if(QQ_obs_N > 10L) {
lambda <- P_lambda(QQ_obs_short)
if(any(QQ_obs_short < 1e-300)) {
warning("Extreme p-values in dataset - temporarly set to 1e-300 for QQ & Manhattan plots")
QQ_obs_short <- ifelse(QQ_obs_short < 1e-300, 1e-300, QQ_obs_short)
dataset$P_VAL <- ifelse(dataset$P_VAL < 1e-300 & !is.na(dataset$P_VAL), 1e-300, dataset$P_VAL)
}
QQ_obs_short<- sort(-log10(QQ_obs_short))
QQ_incl <- QQ_obs_short >= -log10(plot_cutoff_p)
} else {
lambda <- NA
QQ_incl <- 0
}
# Stage 2: creating QQ plots
if(plot_QQ & sum(QQ_incl) > 10L ) {
QQ_exp <- sort(-log10(ppoints(QQ_obs_N)))
QQ_exp_short<- QQ_exp[QQ_incl]
QQ_obs_short<- QQ_obs_short[QQ_incl]
QQ_exp_min <- QQ_exp_short[1]
QQ_exp_max <- QQ_exp_short[length(QQ_exp_short)]
QQ_obs_min <- QQ_obs_short[1]
QQ_obs_max <- QQ_obs_short[length(QQ_obs_short)]
if(plot_QQ_bands) {
temp <- (1:QQ_obs_N)
i1000 <- c(1, (1:1000) * floor(QQ_obs_N / 1000), QQ_obs_N)
QQ_band_upper <- sort(-log10(qbeta( 1 - 0.05 / 2, temp, QQ_obs_N - temp + 1 ) ) )[i1000]
QQ_band_lower <- sort(-log10(qbeta( 0.05 / 2, temp, QQ_obs_N - temp + 1 ) ) )[i1000]
QQ_exp <- QQ_exp[i1000]
}
if(high_quality_plots){
tiff(filename = paste0(save_name, ".graph_QQ.tiff"), units = "in", width = 7, height = 7, res = 350,
compression = "lzw")
} else {
png(paste0(save_name, ".graph_QQ.png"),
width = 720, height = 720, res = 144)
}
plot(c(QQ_exp_min, QQ_exp_max), c(QQ_obs_min, QQ_obs_max), xlim = c(0, QQ_exp_max), ylim = c(0, QQ_obs_max),
main = "QQ plot", xlab = "Expected -log10(p-value)", ylab = "Observed -log10(p-value)",
pch = 1, col = "black", sub = save_name, cex.sub = 1.3)
if(plot_QQ_bands) {
polygon( c(QQ_exp, rev(QQ_exp)), c( QQ_band_upper, rev(QQ_exp)), col="grey", border = NA )
polygon( c(QQ_exp, rev(QQ_exp)), c( QQ_band_lower, rev(QQ_exp)), col="grey", border = NA )
}
points(QQ_exp_short, QQ_obs_short, pch = 1, col = "black")
abline(0,1)
text(0, 0.98 * QQ_obs_max, substitute(paste(lambda, " = ", x), list(x=round(lambda, digits = 3))),
pos = 4, col = ifelse(lambda > 2 | lambda < 0.8, "red", "black"))
#abline(h = -log10(0.05/QQ_obs_N), # -log10(5e-8), # Bonferroni threshold
# lty = 3, col="red")
dev.off()
} else { if(plot_QQ) warning("Insufficient significant p-values to create QQ plot") }
# Stage 3: creating Manhattan plot
if(plot_Man & sum(QQ_incl) > 10L ) {
man_set <- dataset[!is.na(dataset$MAPINFO) & !is.na(dataset$CHR) & !is.na(dataset$P_VAL) & dataset$P_VAL <= plot_cutoff_p, c("CHR", "MAPINFO", "P_VAL")]
if(!is.numeric(man_set$CHR)) {
if(is.factor(man_set$CHR)) { man_set$CHR <- as.character(man_set$CHR) }
if(is.character(man_set$CHR)){
man_set$CHR <- toupper(man_set$CHR)
man_set$CHR[man_set$CHR == "X"] <- 23L
man_set$CHR[man_set$CHR == "Y"] <- 24L
man_set$CHR[man_set$CHR == "XY"] <- 25L
man_set$CHR[man_set$CHR %in% c("M", "MT")] <- 26L
}
man_set$CHR <- as.integer(man_set$CHR)
if(any(is.na(man_set$CHR))) {
warning(paste0(sum(is.na(man_set$CHR)), " entries with untranslatable chromosome values"))
man_set <- man_set[!is.na(man_set$CHR), ]
}
}
if(any(!man_set$CHR %in% 1:26)) {
warning(paste0(sum(!man_set$CHR %in% 1:26), " entries with chromosome values outside of the normal range"))
man_set <- man_set[man_set$CHR %in% 1:26, ]
}
manhattanN <- nrow(man_set)
if(manhattanN > 9L) { # testing if there are sufficient p-values < 0.05
chr_size <- data.frame(chromosome = 1:27, # chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 X23 Y24 XY25,M26, 27 = end of M
size = c(247249719, 242951149, 199501827, 191273063, 180857866, 170899992, 158821424, 146274826, 140273252, 135374737, 134452384, 132349534, 114142980, 106368585, 100338915, 88827254, 78774742, 76117153, 63811651, 62435964, 46944323, 49691432, 154913754, 57772954, 0, 0, 0 ),
start= c( 500000, 248249719, 491700868, 691702695, 883475758, 1064833624, 1236233616, 1395555040, 1542329866, 1683103118, 1818977855, 1953930239, 2086779773, 2201422753, 2308291338, 2409130253, 2498457507, 2577732249, 2654349402, 2718661053, 2781597017, 2829041340, 2879232772, 3034646526, NA, NA, NA))
new_pos <- integer(length = manhattanN)
for (mi in 1:24 ) new_pos <- ifelse(man_set$CHR==chr_size$chromosome[mi], chr_size[mi, 3] + man_set$MAPINFO, new_pos)
use_Y <- any(man_set$CHR == 24L)
use_XY <- any(man_set$CHR == 25L)
use_M <- any(man_set$CHR == 26L)
man_label <- c(1:22, "X")
at_label <- chr_size$start[2:24] - 250000 - ( chr_size$start[2:24] - chr_size$start[1:23] ) / 2
if(use_Y) {
man_label <- c(man_label, "Y")
at_label <- c(at_label, chr_size$start[25] - 250000 - ( chr_size$start[25] - chr_size$start[24] ) / 2)
chr_size$start[25] <- chr_size$start[24] + chr_size$size[24] + 500000
} else { chr_size$start[25] <- chr_size$start[24] }
if(use_XY) {
man_label <- c(man_label, "XY")
at_label <- c(at_label, chr_size$start[26] - 250000 - ( chr_size$start[26] - chr_size$start[25] ) / 2)
chr_size$size[25] <- max(man_set$MAPINFO[man_set$CHR == 25])
chr_size$start[26] <- chr_size$start[25] + chr_size$size[25] + 500000
} else { chr_size$start[26] <- chr_size$start[25] }
if(use_M ) {
man_label <- c(man_label, "M")
at_label <- c(at_label, chr_size$start[27] - 250000 - ( chr_size$start[27] - chr_size$start[26] ) / 2)
chr_size$size[26] <- max(man_set$MAPINFO[man_set$CHR == 26])
chr_size$start[27] <- chr_size$start[26] + chr_size$size[26] + 500000
} else { chr_size$start[27] <- chr_size$start[26] }
manMax <- ceiling(-log10(min(man_set$P_VAL)))
if(manMax < 10L) manMax <- 10L
if(high_quality_plots){
tiff(filename = paste0(save_name, ".graph_M.tiff"), units = "in", width = 14, height = 7, res = 350,
compression = "lzw")
} else {
png(paste0(save_name, ".graph_M.png"),
width = 960, height = 480)
}
par(mfrow = c(1,1), mgp = c(2.5, 0.9, 0))
palette(c("red", "green3", "blue", "cyan"))
plot(new_pos, -log10(man_set$P_VAL), pch = 20, xaxs = "i", xaxt = "n", ylim = c(1, manMax),
xlim = c(0, chr_size$start[27]), col = man_set$CHR, cex.lab = 1.8, cex.axis = 1.4,
xlab = "Chromosome", ylab = "Observed -log10(p-value)", main = "Manhattan plot", sub = save_name, cex.sub = 1.3, col.sub = "red")
abline(v = chr_size[2:26, 3] - 250000, lty = 2)
abline(h = -log10(0.05/QQ_obs_N), # -log10(5e-8), # Bonferroni threshold
lty = 3, col="red")
axis(1, at = at_label, labels = man_label, cex.axis = 1.5)
dev.off()
} else warning("Insufficient SNPs remain to create Manhattan plot")
} else {
if(plot_Man) warning("Insufficient SNPs remain to create Manhattan plot")
#manhattanN <- NA
}
return(invisible(lambda))
}
# Function translate_header has the standard argument altered & the identify_column function made internal
translate_header <- function(header, standard = c("PROBEID","BETA","SE","P_VAL"), alternative){
if(any(duplicated(alternative[ ,2]))) stop("duplicated elements in alternative, column 2")
capitalized <- toupper(header)
unknowns <- !logical(length = length(header))
missings <- logical(length = length(standard))
identify_column <- function(std_name, alt_names, header) { return(which(header %in% c(alt_names[ which(alt_names[ ,1]==std_name), 2], std_name))) }
for(forI in 1:length(standard)) {
column_no <- identify_column(std_name = standard[forI], alt_names = alternative, header = capitalized)
if(length(column_no) == 0L) {
missings[forI] <- TRUE
} else {
header[column_no] <- standard[forI]
unknowns[column_no] <- FALSE
}
}
return(list(header_N = length(header), header_h = header,
missing_N = sum(missings), missing_h = if(sum(missings) == 0L) NULL else standard[missings],
unknown_N = sum(unknowns), unknown_h = if(sum(unknowns) == 0L) NULL else header[unknowns] ) )
}
EWAS_QC <- function(data, # datatable with ewas results, or filename of the same
map, # datatable with chr & pos of CpGs in idata or filename of the same
outputname, # characterstring to identify dataset in output
header_translations, # translation table for file headers
threshold_outliers = c(NA, NA),
markers_to_exclude,
exclude_outliers = FALSE,
exclude_X = FALSE,
exclude_Y = FALSE,
save_final_dataset = TRUE,
gzip_final_dataset = TRUE,
header_final_dataset = "standard",
high_quality_plots = FALSE,
return_beta = FALSE,
N_return_beta = 500000L,
...){
### Stage 0: checking input
# 0: Prepation: creating functions, checking params & output template
zv_startime <- date()
use_map <- !missing(map)
if(use_map) { use_map <- !is.null(map) }
zf_emergencyExit <- function(value, message){ # removed the warning to avoid duplicates (and it's less than informative) since the name of zf_EmergencyExit is returned
# The function uses "file" from value to determine filename
# it returns 'value' so that it can be passed to the return that aborts EWAS_QC
#warning(paste0("Error in file ", value$file, " : ", message))
print(paste0("Error in file ", value$file, " : ", message), quote = FALSE)
return(value)
}
outcome_QC <- integer(9L)
names(outcome_QC) <- c("input", "unusable", "removed_at_request", "chrX", "chrY", "below_lower_limit", "above_upper_limit", "removed", "final")
outcome_QC <- list(data_input = "",
file = "",
QC_success = FALSE,
lambda = -1,
p_cor = -1,
N = outcome_QC,
SE_median = -1,
mean_methylation = NULL,
effect_size = NULL)
# 0: Checking exclude & threshold outliers
if(!zf_testLogical(exclude_outliers)) stop("'exclude_outliers' is not a single logical value")
if(!(is.vector(threshold_outliers) & length(threshold_outliers) == 2L)) {
stop("'threshold_outliers' is not a numeric vector of length 2") }
use_outliers <- !all(is.na(threshold_outliers))
if(use_outliers){
if(!is.numeric(threshold_outliers)) stop("'threshold_outliers' is not numeric")
if(threshold_outliers[1] > threshold_outliers[2] & !is.na(threshold_outliers[1]) & !is.na(threshold_outliers[2])) {
threshold_outliers <- threshold_outliers[2:1]
}
} else {
if(exclude_outliers) stop("'exclude_outliers' is set to TRUE, but no thresholds have been specified")
}
# 0: Checking exclude_X & exclude_Y
if(!zf_testLogical(exclude_X)) stop("'exclude_X' is not a single logical value")
if(!zf_testLogical(exclude_Y)) stop("'exclude_Y' is not a single logical value")
if((exclude_X | exclude_Y) & !use_map) stop("No map has been specified to exclude X or Y chromosome markers")
# 0: checking markers to exclude
use_MTE <- if(missing(markers_to_exclude)) FALSE else !is.null(markers_to_exclude)
if(use_MTE){
if(is.character(markers_to_exclude) & length(markers_to_exclude) == 1L) {
if(file.exists(markers_to_exclude)){ markers_to_exclude <- read.table(markers_to_exclude, header = F, stringsAsFactors = FALSE)
} else { stop("Cannot find a file with the name specified in 'markers_to_exclude'")}
}
if(is.data.frame(markers_to_exclude)){ markers_to_exclude <- markers_to_exclude[,1]
} else {
if(!is.vector(markers_to_exclude)) stop("'markers_to_exclude' is not a vector or filename of a file containing a vector")
}
markers_to_exclude <- markers_to_exclude[!is.na(markers_to_exclude)]
if(length(markers_to_exclude) == 0L) stop("'markers_to_exclude' contains no non-missing values")
}
# 0: Checking save_final_dataset, return_beta & high_quality_plots
if(!zf_testLogical(save_final_dataset)) stop("'save_final_dataset' is not a single logical value")
if(!zf_testLogical(return_beta)) stop("'return_beta' is not a single logical value")
if(return_beta) stopifnot(is.integer(N_return_beta), is.vector(N_return_beta), length(N_return_beta) == 1L, N_return_beta > 50L)
if(!zf_testLogical(high_quality_plots)) stop("'high_quality_plots' is not a single logical value")
# 0: checking outputname & gzip_final_dataset
if(!(is.character(outputname) & length(outputname) == 1L)) stop("'outputname' is not a valid filename")
if(nchar(outputname) == 0L) stop("'outputname' is not a valid filename")
outcome_QC$file <- outputname # will be changed if save & gzip are TRUE
if(save_final_dataset){
if(!zf_testLogical(gzip_final_dataset)) stop("'gzip_final_dataset' is not a single logical value")
outcome_QC$file <- if(gzip_final_dataset) paste0(outputname, ".txt.gz") else paste0(outputname, ".txt")
if(file.exists(outcome_QC$file)) stop(paste0("Cannot save output file: ", outcome_QC$file, " already exists"))
if(length(header_final_dataset) == 1L) {
stopifnot(is.character(header_final_dataset), nchar(header_final_dataset) > 2L)
if(header_final_dataset %in% c("original", "standard")) {
use_outheader <- header_final_dataset
} else {
if(header_final_dataset %in% c("GWAMA", "PLINK", "META")) {
use_outheader <- header_final_dataset # temporary storage for command below
header_final_dataset <- data.frame(
GWAMA = c("MARKER", "CHR", "POSITION", "BETA", "SE", "P"),
PLINK = c("SNP", "CHR", "BP", "BETA", "SE", "P"),
META = c("rsid", "chr", "pos", "beta", "se", "P_value"),
QC = c("PROBEID","CHR", "POSITION", "BETA", "SE", "P_VAL"),
stringsAsFactors = FALSE)
header_final_dataset <- header_final_dataset[ ,c(use_outheader, "QC")]
} else {
if(!file.exists(header_final_dataset)) stop("'header_final_dataset' is not a standard name nor a filename in the working directory")
header_final_dataset <- read.table(header_final_dataset, header = F, stringsAsFactors = FALSE)
}
use_outheader <- "table"
}
} else {
if(!is.matrix(header_final_dataset) & !is.data.frame(header_final_dataset)) stop("'header_final_dataset' is not a table, filename or standard name")
use_outheader <- "table"
}
if(use_outheader == "table"){
if(is.matrix(header_final_dataset) | is.data.frame(header_final_dataset)) {
if(ncol(header_final_dataset) != 2L) { stop("'header_final_dataset' does not have 2 columns") }
if(any(is.na(header_final_dataset))) { stop("'header_final_dataset' contains missing values") }
if(!is.character(header_final_dataset[,1])) { stop("'header_final_dataset' contains non-character values") }
if(!is.character(header_final_dataset[,2])) { stop("'header_final_dataset' contains non-character values") }
if(any(duplicated(header_final_dataset[ ,1]),duplicated(header_final_dataset[ ,2]))) { stop("'header_final_dataset' contains duplicated names") }
} else {
stop("'header_final_dataset' is not a table, filename or standard name")
}
}
} else { gzip_final_dataset <- FALSE } # this is necessary for log
# 0: checking header translation table
use_translation <- !missing(header_translations)
if(use_translation) { use_translation <- !is.null(header_translations) }
if(use_translation) {
if(length(header_translations) == 1L) {
stopifnot(is.character(header_translations), nchar(header_translations) > 2L, file.exists(header_translations))
header_translations <- read.table(header_translations, stringsAsFactors = FALSE)
}
if(!is.data.frame(header_translations)) stop("'header_translations' is not dataframe or a filename leading to such")
if(ncol(header_translations) != 2L) stop("'header_translations' does not have two columns")
if(any(duplicated(header_translations[ ,2]))) stop("'header_translations' contains duplicated elements in column 2")
}
#### Stage 1: loading data & map & checking column names
print("", quote = FALSE)
print(paste0(" *** QC'ing ", outputname), quote = FALSE)
print("", quote = FALSE)
flush.console()
# 1: map
if(use_map){
if(is.character(map) & length(map) == 1L) {
if(!file.exists(map)) stop(paste0("Cannot find file: ", map))
map <- read.table(map, header = T, stringsAsFactors = FALSE, comment.char = "")
}
if(!is.data.frame(map)) stop("'map' is not a dataframe or a filename leading to such")
colnames(map) <- toupper(colnames(map))
if(!all(c("TARGETID", "CHR", "MAPINFO") %in% colnames(map))) {
stop(paste0("map does not have columns: ",
paste0(c("TARGETID", "CHR", "MAPINFO")[!c("TARGETID", "CHR", "MAPINFO") %in% colnames(map)],
collapse = ", ")))
}
# We cannot allow duplicate or missing probeIDs to be present in map,
# because otherwise temp_map won't be of equal length to data.
if(any(is.na(map$TARGETID))) stop("'map' contains missing TargetIDs")
if(any(duplicated(map$TARGETID))) stop("'map' contains duplicate TargetIDs")
}
# 1: data
if(is.character(data) & length(data) == 1L) {
if(!file.exists(data)) stop(paste0("Cannot find file: ", data))
outcome_QC$data_input <- data
data <- try(read.table(data, header = T, stringsAsFactors = FALSE, ...))
if(is(data, "try-error")) return(zf_emergencyExit(value = outcome_QC,
message = paste0("cannot load file ", outcome_QC$data_input, "; check warnings() for details")))
} else { outcome_QC$data_input <- "user-supplied table" }
if(!is.data.frame(data)) stop("'data' is not a dataframe or a filename leading to such")
# The above error won't occur for EWAS_series, as that loads all data via read.table
# hence we use stop isntead of the return function
header_orig <- colnames(data)
header_std <- c("PROBEID", "BETA", "SE", "P_VAL")
if(use_translation){
header_info <- translate_header(header = toupper(header_orig),
standard = c(header_std, "CHR", "POSITION"),
alternative = header_translations)
if(any(duplicated(header_info$header_h))) {
return(zf_emergencyExit(value = outcome_QC,
message = paste0("Duplicate columns: ", paste0(header_info$header_h[duplicated(header_info$header_h)], collapse = ", ") )))
}
colnames(data) <- header_info$header_h
} else { colnames(data) <- toupper(header_orig) }
if(!all(header_std %in% colnames(data))) {
return(zf_emergencyExit(value = outcome_QC,
message = paste0("Columns: ",
paste0(header_std[!header_std %in% colnames(data)], collapse = ", "),
" are missing from data. Data does contain (but could not identify) column(s) ", paste0(colnames(data)[!colnames(data) %in% header_std], collapse = ", "), "."
)))
}
#### Stage 2: data integrity
outcome_QC$N["input"] <- nrow(data)
if(outcome_QC$N["input"] == 0L) return(zf_emergencyExit(value = outcome_QC,
message = "'data' contains no entries"))
vec_unusable <- logical(outcome_QC$N["input"])
# 2: probeID - checking for duplicates
vec_NA_ID <- is.na(data$PROBEID)
N_NA_ID <- sum(vec_NA_ID)
if(N_NA_ID > 0L) { vec_unusable <- vec_unusable | vec_NA_ID }
vec_dupli <- duplicated(data$PROBEID)
N_dupli <- sum(vec_dupli)
if(N_dupli > 0L) {
temp_duplis <- data$PROBEID[vec_dupli]
N_dupli <- sum(data$PROBEID %in% temp_duplis)
rm(temp_duplis)
warning(paste0(N_dupli, " entries with duplicated IDs in file: ", outputname, "."))
}
# 2: Beta
vec_NA_beta <- is.na(data$BETA)
N_NA_beta <- sum(vec_NA_beta)
if(N_NA_beta == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
message = "'data' contains no BETA values"))
if(!is.numeric(data$BETA)) return(zf_emergencyExit(value = outcome_QC,
message = "BETA column inside 'data' does not contain numeric values"))
if(N_NA_beta > 0L) { vec_unusable <- vec_unusable | vec_NA_beta }
# 2: SE - checking for negative values
vec_temp <- is.na(data$SE)
N_NA_SE <- sum(vec_temp)
if(N_NA_SE == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
message = "'data' contains no SE values"))
if(!is.numeric(data$SE)) return(zf_emergencyExit(value = outcome_QC,
message = "SE column inside 'data' does not contain numeric values"))
if(N_NA_SE > 0L) { vec_unusable <- vec_unusable | vec_temp }
vec_temp <- data$SE <= 0 & !vec_temp # vec_temp represent missing values up until this point
N_bad_SE <- sum(vec_temp)
if(N_bad_SE > 0L) {
warning(paste0(N_bad_SE, " entries with a negative or zero standard-error in file: ", outputname, ". These values have been set to NA."))
data$SE[vec_temp] <- NA
vec_unusable <- vec_unusable | vec_temp
}
# 2: Pval - checking for missing, bad en p = 0
vec_temp <- is.na(data$P_VAL)
N_NA_p <- sum(vec_temp)
if(N_NA_p == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
message = "'data' contains no p values"))
if(!is.numeric(data$P_VAL)) return(zf_emergencyExit(value = outcome_QC,
message = "P_VAL column inside 'data' does not contain numeric values"))
if(N_NA_p > 0L) { vec_unusable <- vec_unusable | vec_temp }
vec_temp <- (data$P_VAL < 0 | data$P_VAL > 1) & !vec_temp # vec_temp represent missing values up until this point
N_bad_p <- sum(vec_temp)
if(N_bad_p > 0L) {
warning(paste0(N_bad_p, " entries with p-values outside or the range 0 - 1 in file: ", outputname, ". These values have been set to NA."))
data$P_VAL[vec_temp] <- NA
vec_unusable <- vec_unusable | vec_temp
}
N_low_p <- sum(data$P_VAL < 1e-300 & !is.na(data$P_VAL))
if(N_low_p > 0L){
N_zero_p <- sum(data$P_VAL == 0 & !is.na(data$P_VAL))
if(N_zero_p > 0L) warning(paste0(N_zero_p, " entries with p = 0 in file: ", outputname, ". This indicates a rounding error."))
warning(paste0(N_low_p, " entries with p < 1e-300 in file: ", outputname, ". These values will be temporarily set to 1e-300 for graph creation."))
} else { N_zero_p <- 0L }
# 2: final check
rm(vec_temp)
outcome_QC$N["unusable"] <- sum(vec_unusable)
if(outcome_QC$N["unusable"] == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
message = "All entries in 'data' contain missing or invalid values"))
#print(paste0(if(outcome_QC$N["unusable"] == 0L) "No" else outcome_QC$N["unusable"], " entries with missing values"), quote = FALSE)
print(paste0(outcome_QC$N["unusable"], " entries with missing values"), quote = FALSE)
#### Stage 3: data cleaning
# generate matching map for use in Manhattan plot
if(use_map) {
N_map <- sum(data$PROBEID %in% map$TARGETID)
if(N_map == 0L){
use_map <- FALSE
exclude_X <- FALSE
exclude_Y <- FALSE
warning(paste0("Cannot matchs probeIDs in file ", outputname, " with map. Check map or file probeIDs for problems."))
} else {
if(N_map < (outcome_QC$N["input"]-N_NA_ID) ) { warning(paste0((outcome_QC$N["input"] - N_NA_ID) - N_map, " entries in file ", outputname, " do not appear in the map.")) }
temp_map <- data.frame(data[, c("PROBEID", "P_VAL")], # only takes PROBEID and P_VAL to avoid issues with existing chr/position columns
order = 1:outcome_QC$N["input"],
stringsAsFactors = FALSE)
temp_map <- merge(temp_map, map[, c("TARGETID", "CHR", "MAPINFO")], by.x = "PROBEID", by.y = "TARGETID",
all.x = TRUE, all.y = FALSE, sort = FALSE)
temp_map <- temp_map[order(temp_map$order), -3]
if(!all(temp_map$PROBEID == data$PROBEID)) stop("DEBUG ERROR - temp_map does not match data-probeIDs")
temp_map$CHR <- ifelse(is.na(temp_map$CHR), -9, temp_map$CHR) # so we don't have to check for missing values in the following logic test
}
rm(map)
} else { N_map <- NA }
# 3: removing markers_to_exclude
if(use_MTE){
vec_remove <- data$PROBEID %in% markers_to_exclude
outcome_QC$N["removed_at_request"] <- sum(vec_remove)
} else {
vec_remove <- logical(outcome_QC$N["input"])
outcome_QC$N["removed_at_request"] <- NA
}
# 3: counting X & Y chr
if(use_map){ # the use_map test is so that use_map can be disabled above
# we di bit count probes that have already been excluded above
vec_temp <- ( temp_map$CHR == "X" | temp_map$CHR == 23 ) & !vec_remove
outcome_QC$N["chrX"] <- sum(vec_temp)
if(outcome_QC$N["chrX"] > 0L & exclude_X) vec_remove <- vec_remove | vec_temp
vec_temp <- ( temp_map$CHR == "Y" | temp_map$CHR == 24 ) & !vec_remove
outcome_QC$N["chrY"] <- sum(vec_temp)
if(outcome_QC$N["chrY"] > 0L & exclude_Y) vec_remove <- vec_remove | vec_temp
rm(vec_temp)
} else {
outcome_QC$N["chrX"] <- NA
outcome_QC$N["chrY"] <- NA
}
# 3: counting extremes
if(is.na(threshold_outliers[1])){
outcome_QC$N["below_lower_limit"] <- 0L
} else {
vec_temp <- data$BETA < threshold_outliers[1] & !vec_NA_beta & !vec_remove # this excludes any previously excluded markers
outcome_QC$N["below_lower_limit"] <- sum(vec_temp)
if(outcome_QC$N["below_lower_limit"] > 0L & exclude_outliers) vec_remove <- vec_remove| vec_temp
rm(vec_temp)
}
if(is.na(threshold_outliers[2])){
outcome_QC$N["above_upper_limit"] <- 0L
} else {
vec_temp <- data$BETA > threshold_outliers[2] & !vec_NA_beta & !vec_remove # this excludes any previously excluded markers
outcome_QC$N["above_upper_limit"] <- sum(vec_temp)
if(outcome_QC$N["above_upper_limit"] > 0L & exclude_outliers) vec_remove <- vec_remove| vec_temp
rm(vec_temp)
}
# 3: removing extremes & X & Y
outcome_QC$N["removed"] <- sum(vec_remove)
if(use_MTE) print(paste0(outcome_QC$N["removed_at_request"], " probes specified by user excluded"), quote = FALSE)
if(exclude_X) print(paste0(outcome_QC$N["chrX"], " X-chromosome probes excluded"), quote = FALSE)
if(exclude_Y) print(paste0(outcome_QC$N["chrY"], " Y-chromosome probes excluded"), quote = FALSE)
if(use_outliers){
print(paste0(outcome_QC$N["below_lower_limit"] + outcome_QC$N["above_upper_limit"], " outliers ", if(exclude_outliers) "excluded" else "found"), quote = FALSE)
}
if(sum(use_MTE, exclude_outliers, exclude_X, exclude_Y) > 1L) print(paste0(outcome_QC$N["removed"], " total probes excluded"), quote = FALSE)
flush.console()
if(outcome_QC$N["removed"] > 0L){
if(outcome_QC$N["removed"] == outcome_QC$N["input"]) return(zf_emergencyExit(value = outcome_QC,
message = "All entries have been removed from 'data'"))
data <- data[!vec_remove, ]
if(use_map) { temp_map <- temp_map[!vec_remove, ] }
vec_unusable <- vec_unusable[!vec_remove]
vec_NA_ID <- vec_NA_ID[!vec_remove]
vec_dupli <- vec_dupli[!vec_remove]
vec_NA_beta <- vec_NA_beta[!vec_remove]
if(all(vec_unusable)) return(zf_emergencyExit(value = outcome_QC,
message = "All entries in 'data' contain missing or invalid values"))
}
rm(vec_remove)
outcome_QC$N["final"] <- nrow(data)
#### Phase 4: generating plots
# 4: histograms
if(high_quality_plots){
tiff(filename = paste0(outputname, ".histo.tif"), units = "in", width = 9, height = 4.5, res = 350,
compression = "lzw")
} else {
png(paste0(outputname, ".histo.png"), width = 960, height = 480, res = 108)
}
par(mfrow = c(1, 2))
hist(data$BETA,
breaks = 30, freq = FALSE, col = "blue", plot = TRUE,
main = "Beta", xlab = "Beta", cex.main = 1.5)
hist(data$SE,
breaks = 30, freq = FALSE, col = "red", plot = TRUE,
main = "Standard error", xlab = "Standard error", cex.main = 1.5,
sub = outputname, cex.sub = 1.3)
dev.off()
# 4: P correlation plot
outcome_QC$p_cor <- P_correlation(dataset = data,
plot_correlation = TRUE, plot_if_threshold = FALSE, threshold_r = 0.99,
high_quality_plots = high_quality_plots, save_name = outputname)
# 4: Manhattan & QQ plots plot - note that we run this even if there is no map in order to calculate lamda
outcome_QC$lambda <- EWAS_plots(dataset = if(use_map) temp_map else data$P_VAL,
plot_QQ = TRUE, plot_Man = use_map, plot_cutoff_p = 0.05,
plot_QQ_bands = FALSE, high_quality_plots = high_quality_plots,
save_name = outputname)
if(use_map) rm(temp_map)
# 4: Volcano plot - Effect/Beta op X-as, y-as = -log10 p-waarde
temp_p <- if(N_low_p > 0L) ifelse(data$P_VAL < 1e-300, 1e-300, data$P_VAL) else data$P_VAL
temp_bonf <- 0.05/sum(!is.na(temp_p))
if(high_quality_plots){
tiff(filename = paste0(outputname, ".graph_volcano.tif"), units = "in", width = 7, height = 7, res = 350,
compression = "lzw")
} else {
png(paste0(outputname, ".graph_volcano.png"),
width = 720, height = 720, res = 144)
}
plot(data$BETA, -log10(temp_p),
main = "Volcano plot", sub = outputname, cex.sub = 1.3,
xlab = "Effect Size", ylab = "-log10(p value)")
if(min(temp_p, na.rm = T) < temp_bonf){
abline(h = -log10(temp_bonf), # -log10(5e-8), # Bonferroni threshold
lty = 3, col="red")
}
dev.off()
rm(temp_p)
# 4: Effect size against methylation beta (beta vs. beta-value) - ook even in de ijskast
#### 5 : generate output
# 5: update outcome_QC - this needs to occur before creation of the log file
outcome_QC$SE_median <- median(data$SE, na.rm = TRUE)
#outcome_QC$mean_methylation <- NULL # not implemented yet
if(return_beta) {
#N_return_beta <- 50000L
N_final_beta <- sum(!vec_NA_beta)
if(N_final_beta < N_return_beta) {
outcome_QC$effect_size <- c(data$BETA[!vec_NA_beta], rep(NA, N_return_beta - N_final_beta))
} else {
#outcome_QC$effect_size <- sort(data$BETA[!vec_NA_beta])[c(1:N_return_beta)*(N_final_beta %/% N_return_beta)]
outcome_QC$effect_size <- sort(sample(x = data$BETA[!vec_NA_beta], size = N_return_beta, replace = FALSE))
}
}
outcome_QC$QC_success <- TRUE
# 5: create log file - this needs to occur before the file headers are changed
print("", quote = FALSE)
print("", quote = FALSE)
write.table(c(
"****************************************",
"\t\tEWAS_QC LOG FILE",
"****************************************",
""), paste0(outputname, ".log"), append = FALSE,
quote = FALSE, row.names = FALSE, col.names = FALSE)
zc_log <- file(description = paste0(outputname, ".log"), open = "a")
write.table(data.frame(
v1 = c("File", "QC Start Time", "QC End Time", "EWAS_QC Version"),
v2 = "",
v3 = c(outcome_QC$file, zv_startime, date(), "1.2-3"),
stringsAsFactors = FALSE),
zc_log, quote = FALSE,
sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(c("",
"",
"****************************************",
"\t1. Number of entries in file",
"****************************************",
""), zc_log, quote = FALSE,
row.names = FALSE, col.names = FALSE)
write.table(t(c("", "", "N", "", "", "", "N")),
zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(data.frame(
v1 = c("Start", "ID Missing", "ID Duplicated", "Beta Missing", "SE Missing", "SE Invalid", "P Missing", "P Invalid", "Total missing & invalid"),
v2 = "",
v3 = c(outcome_QC$N["input"], N_NA_ID, N_dupli, N_NA_beta, N_NA_SE, N_bad_SE, N_NA_p, N_bad_p, outcome_QC$N["unusable"]),
v4 = "",
v5 = c("Start", "Removed at request", "Not found in map", "> X", "> Y", "Outliers High", "Outliers Low", "Total Removed", "Final"),
v6 = "",
v7 = c(outcome_QC$N["input"], outcome_QC$N["removed_at_request"], outcome_QC$N["input"] - N_map, outcome_QC$N["chrX"], outcome_QC$N["chrY"], outcome_QC$N["below_lower_limit"], outcome_QC$N["above_upper_limit"], outcome_QC$N["removed"], outcome_QC$N["final"]),
stringsAsFactors = FALSE),
zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(c("",
"",
"****************************************",
"\t2. Summary statistics",
"****************************************",
""), zc_log, quote = FALSE,
row.names = FALSE, col.names = FALSE)
write.table(data.frame(
v1 = c("p-value correlation", "Lambda", "SE median"),
v2 = "",
v3 = c(outcome_QC$p_cor, outcome_QC$lambda, outcome_QC$SE_median),
stringsAsFactors = FALSE),
zc_log, quote = FALSE,
sep = "\t", row.names = FALSE, col.names = FALSE)
zf_saveStats <- function(stat, statname){ # we have to add name, otherwise c() will convert all integers to numeric
Nall <- length(stat)
NNA <- sum(is.na(stat))
PNA <- 100*NNA/Nall
if(PNA == 100) return(c(statname, Nall - NNA, NNA, round(PNA, digits = 1), rep(NA, 6)))
#return(c(statname, Nall - NNA, NNA, round(PNA, digits = 1),
# round(c(mean(stat, na.rm = TRUE), quantile(stat, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, names = FALSE))[c(2,3,4,1,5,6)], digits = 2)))
return(c(statname, Nall - NNA, NNA, round(PNA, digits = 1),
c(mean(stat, na.rm = TRUE), quantile(stat, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, names = FALSE))[c(2,3,4,1,5,6)]))
}
outcome_stats <- data.frame(stat = c("Beta", "StdErr", "P"), Nfinal = integer(3), NNA = integer(3), PNA = integer(3),
min = numeric(3), Q25 = numeric(3), median = numeric(3), mean = numeric(3), Q75 = numeric(3), max = numeric(3),
stringsAsFactors = FALSE)
outcome_stats[1, ] <- zf_saveStats(stat = data$BETA , statname = "Beta")
outcome_stats[2, ] <- zf_saveStats(stat = data$SE , statname = "StdErr")
outcome_stats[3, ] <- zf_saveStats(stat = data$P_VAL, statname = "P")
write.table("", zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(t(c("" ,"N final", "N missing", "% missing", "min", "25%", "median", "mean", "75%", "max")),
zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(outcome_stats,
zc_log, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
close(zc_log)
# Saving final dataset
if(save_final_dataset) {
if(use_outheader == "original") colnames(data) <- header_orig
if(use_outheader == "table") {
temp_header <- translate_header(header = colnames(data), standard = header_final_dataset[ ,1], alternative = header_final_dataset)
colnames(data) <- temp_header$header_h
}
write.table(data,
if(gzip_final_dataset) gzfile(outcome_QC$file) else outcome_QC$file,
row.names = FALSE, quote = FALSE)
}
rm(data, vec_unusable, vec_NA_ID, vec_dupli, vec_NA_beta)
gc(FALSE, FALSE)
return(outcome_QC)
}
EWAS_series <- function(EWAS_files, # datatable with ewas results, or filename of the same
output_files,
map, # datatable with chr & pos of CpGs in idata or filename of the same
N, # sample sizes for precision plot
header_translations, # translation table for file headers
save_final_dataset = TRUE,
gzip_final_dataset = TRUE,
high_quality_plots = FALSE,
N_plot_beta = 500000L,
...){
# all other arguments will be checked by the first EWAS_QC, so no need to check 'em here
zf_loadAndReturn <- function(filename, filedesc, ...){
if(is.data.frame(filename) | is.matrix(filename)) return(filename)
if(!is.character(filename) | length(filename) != 1L) stop(paste0("argument '", filedesc, "' is neither a dataset nor a filename"))
if(!file.exists(filename)) stop(paste0("Cannot find file '", filename, "' in the working directory"))
return(read.table(file = filename, stringsAsFactors = FALSE, ...))
}
use_map <- !missing(map)
if(use_map) { use_map <- !is.null(map) }
use_translation <- !missing(header_translations)
if(use_translation) { use_translation <- !is.null(header_translations) }
use_N <- !missing(N)
if(use_N) { use_N <- !is.null(N) }
# Checking N_plot_beta
if(is.numeric(N_plot_beta)) {
if(as.integer(N_plot_beta) == N_plot_beta){
N_plot_beta <- as.integer(N_plot_beta)
} else stop("'N_plot_beta' is not an integer")
}
stopifnot(is.integer(N_plot_beta), is.vector(N_plot_beta), length(N_plot_beta) == 1L, N_plot_beta > 50L)
# checking input & output filenames, as well as save commands
stopifnot(is.vector(EWAS_files), is.character(EWAS_files))
if(any(nchar(EWAS_files) == 0L)) stop("'EWAS_files' contains invalid filenames")
if(any(duplicated(EWAS_files))) stop("duplicate filenames in 'EWAS_files'")
if(any(!file.exists(EWAS_files))) stop(paste0("files: ", paste0(EWAS_files[!file.exists(EWAS_files)], collapse = ", "), " do not exist"))
N_file <- length(EWAS_files)
if(missing(output_files)) { output_files <- paste0("QC_", EWAS_files)
} else {
stopifnot(is.vector(output_files), is.character(output_files))
if(N_file != length(output_files)) stop("vectors 'EWAS_files' (input) and 'output_files' are of unequal length")
}
# removing .gz
output_files <- ifelse(substr(output_files, nchar(output_files) - 2, nchar(output_files)) == ".gz",
substr(output_files, 1, nchar(output_files) - 3),
output_files)
if(any(nchar(output_files) == 0L)) stop("'output_files' contains valid filename")
if(any(duplicated(output_files))) stop("duplicate filenames in 'output_files'")
if(!zf_testLogical(save_final_dataset)) stop("'save_final_dataset' is not a single logical value")
if(save_final_dataset){
if(!zf_testLogical(gzip_final_dataset)) stop("'gzip_final_dataset' is not a single logical value")
#if(!zf_testLogical(save_standard_header)) stop("'save_standard_header' is not a single logical value")
if(gzip_final_dataset){
if(any(file.exists(paste0(output_files, ".gz")))) stop("cannot save output: one of the specified 'output_files' already exists")
} else {
if(any(file.exists(output_files))) stop("cannot save output: one of the specified 'output_files' already exists")
}
}
if(!zf_testLogical(high_quality_plots)) stop("'high_quality_plots' is not a single logical value")
# checking N & preparing precision plot
dat_prec <- data.frame(file = EWAS_files, no = 1:N_file,
use = logical(N_file), SE = numeric(N_file),
stringsAsFactors = FALSE)
if(use_N){
if(is.character(N) & length(N) == 1L) { N <- zf_loadAndReturn(filename = N, filedesc = "N", header = TRUE) }
if(!is.data.frame(N)) stop("'N' is not dataframe or a filename leading to such")
if(any(!colnames(N) %in% c("file", "N"))) stop("'N' does not contain the specified columns 'file' and/or 'N'")
if(any(!EWAS_files %in% N$file)) stop("Cannot find matching entries in 'N' for all files in 'EWAS_files'")
dat_prec <- merge(dat_prec, N[ , c("file", "N")], by = "file", all.x = TRUE, all.y = FALSE, sort = FALSE)
rm(N)
dat_prec <- dat_prec[order(dat_prec$no), ]
if(!is.numeric(dat_prec$N)) stop("column N in table 'N' does not contain numeric values")
if(any(dat_prec$N < 1, na.rm = TRUE)) stop("column N in table 'N' does not contain valid numeric values")
if(sum(!is.na(dat_prec$N)) < 2) {
warning("insufficient non-missing N values to generate precision plot")
use_N <- FALSE
}
}
# Checking header translation table
if(use_translation) {
header_translations <- zf_loadAndReturn(filename = header_translations, filedesc = "header_translations")
#if(!is.data.frame(header_translations)) stop("'header_translations' is not dataframe or a filename leading to such")
if(ncol(header_translations) != 2L) stop("'header_translations' does not have two columns")
if(any(duplicated(header_translations[ ,2]))) stop("'header_translations' contains duplicated elements in column 2")
} else { header_translations <- NULL }
# Checking map
if(use_map){
map <- zf_loadAndReturn(filename = map, filedesc = "map", header = TRUE)
#if(!is.data.frame(map)) stop("'map' is not a dataframe or a filename leading to such")
colnames(map) <- toupper(colnames(map))
if(!all(c("TARGETID", "CHR", "MAPINFO") %in% colnames(map))) {
stop(paste0("map does not have columns: ",
paste0(c("TARGETID", "CHR", "MAPINFO")[!c("TARGETID", "CHR", "MAPINFO") %in% colnames(map)],
collapse = ", ")))
}
# We cannot allow duplicate or missing TargetIDs to be present in map,
# because otherwise temp_map won't be of equal length to data.
if(any(is.na(map$TARGETID))) stop("'map' contains missing TargetIDs")
if(any(duplicated(map$TARGETID))) stop("'map' contains duplicate TargetIDs")
} else { map <- NULL }
#### Starting the actual QC
#N_beta <- 500000L
dat_beta <- matrix(data = numeric(N_plot_beta * N_file), nrow = N_plot_beta, ncol = N_file)
for(ci in 1:N_file){
#print(paste0(" *** QC'ing file ", ci, " of ", N_file), quote = FALSE)
#flush.console() # unnecessary as EWAS_QC will already flush
temp_out <- EWAS_QC(data = EWAS_files[ci],
map = map,
outputname = output_files[ci],
header_translations = header_translations,
save_final_dataset = save_final_dataset, gzip_final_dataset = gzip_final_dataset,
high_quality_plots = high_quality_plots,
return_beta = TRUE, N_return_beta = N_plot_beta, ...)
if(temp_out$QC_success){
dat_prec$use[ci] <- TRUE
dat_prec$SE[ci] <- temp_out$SE_median
# mean methylation is not used, for the moment
dat_beta[, ci] <- temp_out$effect_size
}
rm(temp_out)
}
print("", quote = FALSE)
# output:
N_pass <- sum(dat_prec$use)
if(N_pass > 1L){
#Precision plot
dat_temp <- dat_prec[dat_prec$use, ]
if(use_N){
temp_prec <- 1/dat_temp$SE
temp_N <- sqrt(dat_temp$N)
temp_ticks <- pretty(temp_prec) # using pretty() will hopefully ensure neat values on both y-axes
if(high_quality_plots){
tiff(filename = "EWAS_QC_graph_precision.tif", units = "in", width = 7, height = 7, res = 350,
compression = "lzw")
} else {
png("EWAS_QC_graph_precision.png", width = 960, height= 960, res = 144)
}
par(mar = c(5, 4, 4, 4.5) + 0.1) # add +2 to right margin, so to leave space for label
plot(x = temp_N, y = temp_prec, yaxt = "n", # suppresses the y-axis
main = "Precision by Sample Size", xlab = "sqrt(sample size)", ylab = "Precision ( = 1 / median SE)")
axis(side = 2, at = temp_ticks, labels = TRUE)
axis(side = 4, at = temp_ticks, labels = 1/temp_ticks)
mtext(text = "median SE", side = 4, line = 3) # adds label to right margin
text(temp_N, temp_prec, labels = dat_temp$no, pos = 4)
dev.off()
rm(temp_N, temp_prec, temp_ticks)
}
#effect size boxplot
dat_beta <- dat_beta[ , dat_prec$use]
if(use_N){
dat_beta <- dat_beta[ , order(dat_temp$N, na.last = FALSE)]
dat_temp <- dat_temp[order(dat_temp$N, na.last = FALSE), ]
}
dat_quant <- matrix(data = 0.0, ncol = 3, nrow = N_pass,
dimnames = list(NULL, c("quantile25", "median", "quantile75")))
for(ti in 1:ncol(dat_beta)) dat_quant[ti, ] <- quantile(dat_beta[, ti], names = FALSE, na.rm = TRUE)[2:4]
if(high_quality_plots){
tiff(filename = "EWAS_QC_graph_effectsize.tif", units = "in", width = 6 + 1 * N_pass, height = 7, res = 350,
compression = "lzw")
} else {
png("EWAS_QC_graph_effectsize.png", width = 800 + 160 * N_pass, height = 960, res = 144)
}
boxplot(dat_beta, names = if(use_N) paste0(dat_temp$no, ", N=", dat_temp$N) else dat_temp$no,
las = 0, main = "Effect-size distribution")
abline(h = c(median(dat_quant[ ,1]), median(dat_quant[ ,2]), median(dat_quant[ ,3])), lty = 3)
dev.off()
rm(dat_temp)
} else { print("Insufficient files passed the QC to generate graphs", quote = FALSE)}
rm(dat_beta, map)
write.table(dat_prec, "EWAS_QC_legend.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
gc(FALSE, FALSE)
return(invisible(dat_prec))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.